From b33486618e05374d6239f1b0da682a02f659fe34 Mon Sep 17 00:00:00 2001 From: Wei Zhang Date: Sun, 14 Nov 2021 19:19:41 +0000 Subject: [PATCH] osc/rdma: adjustment on btl selection logic This patch makes the following adjustments to the btl selection logic. First, when selecting a primary btl, this patch requires the primary btl to support remote completion. Second, when selecting alternate btls, this patch allows any btl to be used as an alternate btl. Prior to this patch, only a list of pre-defined btls could be used as alternate btls. Finally, when a btl is used as an alternate btl, this patch disables its native atomic and RDMA support, and always uses active message RDMA/atomics with it. The reasons for these changes are: First, these changes ensured the selected btls of osc/rdma always support remote completion (because active message RDMA/atomics supports remote completion when a btl's native RDMA is disabled). Remote completion is essential for several key components of osc/rdma: the usage of cpu atomics to update peer's state, the usage of local leader to update peer's state and its fence implementation. Therefore the assurance of btl's support of remote completion greatly simplified osc/rdma's implementation. Second, these changes eliminated the need to save and exchange more than 1 memory registration, because active message RDMA/atomic does not require explicit memory registration. Third, these changes ensured the correctness of atomic operations. When multiple alternate btls are used, atomicity cannot be guaranteed across each btl's native implementation of atomics. Finally, these changes allowed more btls to be used as alternate btls, especially btl/self. 
Signed-off-by: Wei Zhang --- ompi/mca/osc/rdma/osc_rdma_component.c | 97 ++++++++++++++------------ 1 file changed, 51 insertions(+), 46 deletions(-) diff --git a/ompi/mca/osc/rdma/osc_rdma_component.c b/ompi/mca/osc/rdma/osc_rdma_component.c index 12937ffc644..b914d84543c 100644 --- a/ompi/mca/osc/rdma/osc_rdma_component.c +++ b/ompi/mca/osc/rdma/osc_rdma_component.c @@ -85,7 +85,6 @@ static const char* ompi_osc_rdma_set_no_lock_info(opal_infosubscriber_t *obj, co static char *ompi_osc_rdma_btl_names; static char *ompi_osc_rdma_mtl_names; -static char *ompi_osc_rdma_btl_alternate_names; static const mca_base_var_enum_value_t ompi_osc_rdma_locking_modes[] = { {.value = OMPI_OSC_RDMA_LOCKING_TWO_LEVEL, .string = "two_level"}, @@ -266,14 +265,6 @@ static int ompi_osc_rdma_component_register (void) MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_names); free(description_str); - ompi_osc_rdma_btl_alternate_names = "sm,tcp"; - opal_asprintf(&description_str, "Comma-delimited list of alternate BTL component names to allow without verifying " - "connectivity (default: %s)", ompi_osc_rdma_btl_alternate_names); - (void) mca_base_component_var_register (&mca_osc_rdma_component.super.osc_version, "alternate_btls", description_str, - MCA_BASE_VAR_TYPE_STRING, NULL, 0, 0, OPAL_INFO_LVL_3, - MCA_BASE_VAR_SCOPE_GROUP, &ompi_osc_rdma_btl_alternate_names); - free(description_str); - ompi_osc_rdma_mtl_names = "psm2"; opal_asprintf(&description_str, "Comma-delimited list of MTL component names to lower the priority of rdma " "osc component (default: %s)", ompi_osc_rdma_mtl_names); @@ -919,56 +910,66 @@ static void ompi_osc_rdma_ensure_local_add_procs (void) * @return OMPI_SUCCESS if BTLs can be found * @return OMPI_ERR_UNREACH if no BTLs can be found that match * - * In this case an "alternate" BTL is a BTL that does not provide true RDMA but - * can use active messages using the BTL base AM RDMA/atomics. 
Since more than - * one BTL may be needed for this support the OSC component will disable the - * use of registration-based RDMA (these BTLs will not be used) and will use - * any remaining BTL. By default the BTLs used will be tcp and sm but any single - * (or pair) of BTLs may be used. + * This function is used when ompi_osc_rdma_query_btls() failed to find + * a btl that can be used for all communication. In this case, osc/rdma will + * use multiple btls for communications. One process can use different btls + * to communicate with different peers. Such btls are called "alternate btls". + * + * For an alternate btl, this function disables its native implementation of + * RDMA and atomics, and makes osc/rdma always use the active message RDMA/atomics + * with the alternate btl. + * + * The reason to disable an alternate btl's native atomics is that + * when multiple alternate btls are being used, the atomicity across btls' own + * atomics is not guaranteed. Therefore, osc/rdma must use active message atomics. + * + * The reason to disable an alternate btl's native RDMA put and get is that + * it significantly simplifies osc/rdma's completion. The simplification comes in two + * areas: + * + * First, active message RDMA supports remote completion (when a btl's native + * RDMA is disabled). Remote completion is required by several key components + * of osc/rdma: the usage of cpu atomics to update peer's state, the usage + * of local leader to update peer's state, and its fence implementation. + * If osc/rdma does not use active message RDMA on alternate btls, it will + * have to keep track of each selected btl's support of remote completion. + * If any selected btl does not support remote completion, it will have to + * disable the usage of cpu atomics, disable the usage of local leader, + * and implement a different fence mechanism. 
+ * + * Second, active message RDMA does not use memory registration explicitly, + * therefore using it eliminates the need to store and exchange multiple + * memory registrations. */ static int ompi_osc_rdma_query_alternate_btls (ompi_communicator_t *comm, ompi_osc_rdma_module_t *module) { mca_btl_base_selected_module_t *item; - char **btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); int btls_found = 0; - btls_to_use = opal_argv_split (ompi_osc_rdma_btl_alternate_names, ','); - if (NULL == btls_to_use) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "no alternate BTLs requested: %s", ompi_osc_rdma_btl_alternate_names); - return OMPI_ERR_UNREACH; - } - if (module) { module->btls_in_use = 0; } /* rdma and atomics are only supported with BTLs at the moment */ - for (int i = 0 ; btls_to_use[i] ; ++i) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "checking for btl %s", btls_to_use[i]); - OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { - if (NULL != item->btl_module->btl_register_mem) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping RDMA btl when searching for alternate BTL"); - continue; - } - - if (0 != strcmp (btls_to_use[i], item->btl_module->btl_component->btl_version.mca_component_name)) { - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "skipping btl %s", - item->btl_module->btl_component->btl_version.mca_component_name); - continue; - } + OPAL_LIST_FOREACH(item, &mca_btl_base_modules_initialized, mca_btl_base_selected_module_t) { + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s", + item->btl_module->btl_component->btl_version.mca_component_name); - OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "found alternate btl %s", btls_to_use[i]); + OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "disabing btl's native support of RDMA and ATOMIC"); + item->btl_module->btl_flags &= ~(MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_GET | MCA_BTL_FLAGS_PUT | MCA_BTL_FLAGS_ATOMIC_FOPS ); - ++btls_found; - if (module) { - 
mca_btl_base_am_rdma_init(item->btl_module); - ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); - } - + ++btls_found; + if (module) { + mca_btl_base_am_rdma_init(item->btl_module); + assert(item->btl_module->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION); + ompi_osc_rdma_selected_btl_insert(module, item->btl_module, module->btls_in_use++); } } - opal_argv_free (btls_to_use); + /* active message RDMA/atomics does not require explicit memory registration */ + if (module) { + module->use_memory_registration = false; + } return btls_found > 0 ? OMPI_SUCCESS : OMPI_ERR_UNREACH; } @@ -1003,7 +1004,7 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo } if ((item->btl_module->btl_flags & (MCA_BTL_FLAGS_RDMA)) == MCA_BTL_FLAGS_RDMA && - (item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS))) { + (item->btl_module->btl_flags & (MCA_BTL_FLAGS_ATOMIC_FOPS | MCA_BTL_FLAGS_ATOMIC_OPS | MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION))) { if (!selected_btl || item->btl_module->btl_latency < selected_btl->btl_latency) { selected_btl = item->btl_module; } @@ -1072,10 +1073,14 @@ static int ompi_osc_rdma_query_btls (ompi_communicator_t *comm, ompi_osc_rdma_mo btl_counts = tmp; for (int i_btl = 0 ; i_btl < num_btls ; ++i_btl) { - /* for this implementation we need only compare-and-swap and fetch-and-add */ + /* for this implementation we need only compare-and-swap and fetch-and-add + * + * If a btl does not support remote completion, it cannot be used as the primary btl. 
+ * It can still be selected as an alternate btl */ if ((endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS)) == (MCA_BTL_FLAGS_RDMA | MCA_BTL_FLAGS_ATOMIC_FOPS) && (endpoint->btl_rdma.bml_btls[i_btl].btl->btl_atomic_flags & - MCA_BTL_ATOMIC_SUPPORTS_ADD)) { + MCA_BTL_ATOMIC_SUPPORTS_ADD) && + (endpoint->btl_rdma.bml_btls[i_btl].btl->btl_flags & MCA_BTL_FLAGS_RDMA_REMOTE_COMPLETION)) { for (int j = 0 ; j < max_btls ; ++j) { if (endpoint->btl_rdma.bml_btls[i_btl].btl == possible_btls[j]) { ++btl_counts[j];