Skip to content

Commit

Permalink
osc/rdma: adjustment on btl selection logic
Browse files Browse the repository at this point in the history
This patch makes the following adjustments to the btl selection logic.

First, when selecting a primary btl, this patch requires the primary
btl to support remote completion.

Second, when selecting alternate btls, this patch allows any
btl to be used as an alternate btl. Prior to this patch,
only btls from a pre-defined list could be used as alternate btls.

Finally, when a btl is used as an alternate btl, this patch disables
its native atomic and RDMA support, and always uses active message
RDMA/atomics with it.

The reasons for these changes are:

First, these changes ensured the selected btls of osc/rdma always support
remote completion (because active message RDMA/atomics supports remote completion
when a btl's native RDMA is disabled). Remote completion is essential
for several key components of osc/rdma: the usage of cpu atomics to update
peer's state, the usage of local leader to update peer's state and its
fence implementation. Therefore the assurance of btl's support of
remote completion greatly simplified osc/rdma's implementation.

Second, these changes eliminated the need to save and exchange more
than 1 memory registration, because active message RDMA/atomic does
not require explicit memory registration.

Third, these changes ensured the correctness of atomic operations.
When multiple alternate btls are used, atomicity cannot be guaranteed
across each btl's native implementation of atomics.

Finally, these changes allowed more btls to be used as alternate
btls, especially btl/self.

Signed-off-by: Wei Zhang <[email protected]>
  • Loading branch information
wzamazon committed Nov 23, 2021
1 parent b277aba commit b334866
Showing 1 changed file with 51 additions and 46 deletions.
97 changes: 51 additions & 46 deletions ompi/mca/osc/rdma/osc_rdma_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,6 @@ static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, co

static char *ompi_osc_rdma_btl_names;
static char *ompi_osc_rdma_mtl_names;
static char *ompi_osc_rdma_btl_alternate_names;

static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = {
{.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"},
Expand Down Expand Up @@ -266,14 +265,6 @@ static int ompi_osc_rdma_component_register (void)
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names);
free(description_str);

ompi_osc_rdma_btl_alternate_names = "sm,tcp";
opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying "
"connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names);
(void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str,
MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3,
MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names);
free(description_str);

ompi_osc_rdma_mtl_names = "psm2";
opal_asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma "
"osc component (default: %s)", ompi_osc_rdma_mtl_names);
Expand Down Expand Up @@ -919,56 +910,66 @@ static void ompi_osc_rdma_ensure_local_add_procs (void)
* @return OMPI_SUCCESS if BTLs can be found
* @return OMPI_ERR_UNREACH if no BTLs can be found that match
*
* In this case an "alternate" BTL is a BTL that does not provide true RDMA but
* can use active messages using the BTL base AM RDMA/atomics. Since more than
* one BTL may be needed for this support the OSC component will disable the
* use of registration-based RDMA (these BTLs will not be used) and will use
* any remaining BTL. By default the BTLs used will be tcp and sm but any single
* (or pair) of BTLs may be used.
* This function is used when ompi_osc_rdma_query_btls() failed to find
* a btl that can be used for all communication. In this case, osc/rdma will
* use multiple btls for communications. One process can use a different btl
* to communicate with each peer. Such btls are called "alternate btls".
*
* For an alternate btl, this function disables its native implementation of
* RDMA and atomics, and makes osc/rdma always use the active message RDMA/atomics
* with the alternate btl.
*
* The reason to disable an alternate btl's native atomics is that
* when multiple alternate btls are being used, atomicity across the btls' own
* atomics is not guaranteed. Therefore, osc/rdma must use active message atomics.
*
* The reason to disable an alternate btl's native RDMA put and get is that
* it significantly simplifies osc/rdma's completion. The simplification comes in two
* areas:
*
* First, active message RDMA supports remote completion (when a btl's native
* RDMA is disabled). Remote completion is required by several key components
* of osc/rdma: the usage of cpu atomics to update peer's state, the usage
* of local leader to update peer's state, and its fence implementation.
* If osc/rdma does not use active message RDMA on alternate btls, it will
* have to keep track of each selected btl's support of remote completion.
* If any selected btl does not support remote completion, it will have to
* disable the usage of cpu atomics, disable the usage of local leader,
* and implement a different fence mechanism.
*
* Second, active message RDMA does not use memory registration explicitly,
* therefore using it eliminates the need to store and exchange multiple
* memory registrations.
*/
static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module)
{
mca_btl_base_selected_module_t *item;
char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
int btls_found = 0;

btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ',');
if (NULL == btls_to_use) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names);
return OMPI_ERR_UNREACH;
}

if (module) {
module->btls_in_use = 0;
}

/* rdma and atomics are only supported with BTLs at the moment */
for (int i = 0 ; btls_to_use[i] ; ++i) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]);
OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
if (NULL != item->btl_module->btl_register_mem) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping RDMA btl when searching for alternate BTL");
continue;
}

if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping btl %s",
item->btl_module->btl_component->btl_version.mca_component_name);
continue;
}
OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) {
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s",
item->btl_module->btl_component->btl_version.mca_component_name);

OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s", btls_to_use[i]);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "disabing btl's native support of RDMA and ATOMIC");
item->btl_module->btl_flags &= ~(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_ATOMIC_FOPS );

++btls_found;
if (module) {
mca_btl_base_am_rdma_init(item->btl_module);
ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++);
}

++btls_found;
if (module) {
mca_btl_base_am_rdma_init(item->btl_module);
assert(item->btl_module->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION);
ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++);
}
}

opal_argv_free (btls_to_use);
/* active message RDMA/atomics does not require explicit memory registration */
if (module) {
module->use_memory_registration = false;
}

return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH;
}
Expand Down Expand Up @@ -1003,7 +1004,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
}

if ((item->btl_module->btl_flags & (MCA_BTL_FLAGS_RDMA)) == MCA_BTL_FLAGS_RDMA &&
(item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS))) {
(item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION))) {
if (!selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) {
selected_btl = item->btl_module;
}
Expand Down Expand Up @@ -1072,10 +1073,14 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo
btl_counts = tmp;

for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) {
/* for this implementation we need only compare-and-swap and fetch-and-add */
/* for this implementation we need only compare-and-swap and fetch-and-add
*
* If a btl does not support remote completion, it cannot be used as the primary btl.
* It can still be selected as an alternate btl */
if ((endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) ==
(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS) && (endpoint->btl_rdma.bml_btls[i_btl].btl->btl_atomic_flags &
MCA_BTL_ATOMIC_SUPPORTS_ADD)) {
MCA_BTL_ATOMIC_SUPPORTS_ADD) &&
(endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION)) {
for (int j = 0 ; j < max_btls ; ++j) {
if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) {
++btl_counts[j];
Expand Down

0 comments on commit b334866

Please sign in to comment.