From b2074d1030ffbd2e3c2b24cce61d86f523873d94 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Fri, 2 Feb 2024 11:18:26 -0600 Subject: [PATCH 01/17] misc: rename MPIR_gpu_req to MPIR_async_req MPIR_gpu_req is a union type for either a MPL_gpu_request or a MPIR_Typerep_req, thus it is not just for gpu. Potentially this type can be extended to include other internal async task handles. Thus we rename it to MPIR_async_req. We also establish the convention of naming the variable async_req. --- src/include/mpir_misc.h | 7 ++----- src/mpi/misc/utils.c | 31 ++++++++++++++-------------- src/mpid/ch4/netmod/ofi/ofi_events.c | 13 ++++++------ src/mpid/ch4/netmod/ofi/ofi_impl.h | 22 ++++++++++---------- src/mpid/ch4/netmod/ofi/ofi_types.h | 2 +- src/mpid/ch4/shm/ipc/gpu/gpu_post.c | 14 ++++++------- src/mpid/ch4/shm/ipc/gpu/gpu_post.h | 2 +- src/mpid/ch4/shm/ipc/src/ipc_p2p.h | 6 +++--- 8 files changed, 48 insertions(+), 49 deletions(-) diff --git a/src/include/mpir_misc.h b/src/include/mpir_misc.h index 6a46b13d11c..24237f50f3f 100644 --- a/src/include/mpir_misc.h +++ b/src/include/mpir_misc.h @@ -49,9 +49,6 @@ extern MPL_initlock_t MPIR_init_lock; #include "typerep_pre.h" /* needed for MPIR_Typerep_req */ -/* FIXME: bad names. Not gpu-specific, confusing with MPIR_Request. - * It's a general async handle. 
- */ typedef enum { MPIR_NULL_REQUEST = 0, MPIR_TYPEREP_REQUEST, @@ -64,7 +61,7 @@ typedef struct { MPL_gpu_request gpu_req; } u; MPIR_request_type_t type; -} MPIR_gpu_req; +} MPIR_async_req; int MPIR_Localcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype, void *recvbuf, MPI_Aint recvcount, MPI_Datatype recvtype); @@ -82,7 +79,7 @@ int MPIR_Ilocalcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se MPI_Aint sendoffset, MPL_pointer_attr_t * sendattr, void *recvbuf, MPI_Aint recvcount, MPI_Datatype recvtype, MPI_Aint recvoffset, MPL_pointer_attr_t * recvattr, MPL_gpu_copy_direction_t dir, - MPL_gpu_engine_type_t enginetype, bool commit, MPIR_gpu_req * req); + MPL_gpu_engine_type_t enginetype, bool commit, MPIR_async_req * req); /* Contiguous datatype calculates buffer address with `(char *) buf + dt_true_lb`. * However, dt_true_lb is treated as ptrdiff_t (signed), and when buf is MPI_BOTTOM diff --git a/src/mpi/misc/utils.c b/src/mpi/misc/utils.c index 88de7f1b40c..8be94576821 100644 --- a/src/mpi/misc/utils.c +++ b/src/mpi/misc/utils.c @@ -188,7 +188,8 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp MPI_Aint sendoffset, MPL_pointer_attr_t * send_attr, void *recvbuf, MPI_Aint recvcount, MPI_Datatype recvtype, MPI_Aint recvoffset, MPL_pointer_attr_t * recv_attr, MPL_gpu_copy_direction_t dir, - MPL_gpu_engine_type_t enginetype, bool commit, MPIR_gpu_req * gpu_req) + MPL_gpu_engine_type_t enginetype, bool commit, + MPIR_async_req * async_req) { int mpi_errno = MPI_SUCCESS; int mpl_errno = MPL_SUCCESS; @@ -200,8 +201,8 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp MPIR_FUNC_ENTER; - if (gpu_req) - gpu_req->type = MPIR_NULL_REQUEST; + if (async_req) + async_req->type = MPIR_NULL_REQUEST; MPIR_Datatype_get_size_macro(sendtype, sendsize); MPIR_Datatype_get_size_macro(recvtype, recvsize); @@ -260,7 +261,7 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint 
sendcount, MPI_Datatyp MPIR_ERR_CHKANDJUMP(dev_id == -1, mpi_errno, MPI_ERR_OTHER, "**mpl_gpu_get_dev_id_from_attr"); - if (gpu_req == NULL) { + if (async_req == NULL) { MPL_gpu_request req; mpl_errno = MPL_gpu_imemcpy((char *) MPIR_get_contig_ptr(recvbuf, recvtype_true_lb) + @@ -281,8 +282,8 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp recvoffset, (char *) MPIR_get_contig_ptr(sendbuf, sendtype_true_lb) + sendoffset, copy_sz, dev_id, dir, enginetype, - &gpu_req->u.gpu_req, commit); - gpu_req->type = MPIR_GPU_REQUEST; + &async_req->u.gpu_req, commit); + async_req->type = MPIR_GPU_REQUEST; } } #else @@ -300,15 +301,15 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp fn_fail: goto fn_exit; fn_fallback: - if (gpu_req) { + if (async_req) { mpi_errno = do_localcopy(sendbuf, sendcount, sendtype, sendoffset, recvbuf, recvcount, recvtype, - recvoffset, LOCALCOPY_NONBLOCKING, &gpu_req->u.y_req); + recvoffset, LOCALCOPY_NONBLOCKING, &async_req->u.y_req); MPIR_ERR_CHECK(mpi_errno); - if (gpu_req->u.y_req.req == MPIR_TYPEREP_REQ_NULL) { - gpu_req->type = MPIR_NULL_REQUEST; + if (async_req->u.y_req.req == MPIR_TYPEREP_REQ_NULL) { + async_req->type = MPIR_NULL_REQUEST; } else { - gpu_req->type = MPIR_TYPEREP_REQUEST; + async_req->type = MPIR_TYPEREP_REQUEST; } } else { mpi_errno = @@ -414,7 +415,7 @@ int MPIR_Ilocalcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se MPI_Aint sendoffset, MPL_pointer_attr_t * sendattr, void *recvbuf, MPI_Aint recvcount, MPI_Datatype recvtype, MPI_Aint recvoffset, MPL_pointer_attr_t * recvattr, MPL_gpu_copy_direction_t dir, - MPL_gpu_engine_type_t enginetype, bool commit, MPIR_gpu_req * req) + MPL_gpu_engine_type_t enginetype, bool commit, MPIR_async_req * async_req) { int mpi_errno = MPI_SUCCESS; @@ -423,14 +424,14 @@ int MPIR_Ilocalcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype se #ifdef MPL_HAVE_GPU mpi_errno = do_localcopy_gpu(sendbuf, 
sendcount, sendtype, sendoffset, sendattr, recvbuf, recvcount, - recvtype, recvoffset, recvattr, dir, enginetype, commit, req); + recvtype, recvoffset, recvattr, dir, enginetype, commit, async_req); MPIR_ERR_CHECK(mpi_errno); #else mpi_errno = do_localcopy(sendbuf, sendcount, sendtype, sendoffset, recvbuf, recvcount, recvtype, - recvoffset, LOCALCOPY_NONBLOCKING, &req->u.y_req); + recvoffset, LOCALCOPY_NONBLOCKING, &async_req->u.y_req); MPIR_ERR_CHECK(mpi_errno); - req->type = MPIR_TYPEREP_REQUEST; + async_req->type = MPIR_TYPEREP_REQUEST; #endif fn_exit: diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 84046946d11..e6ca3921712 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -148,15 +148,16 @@ static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, MPIR_Assert(n_chunks == 0); /* First chunk arrives. */ MPI_Aint actual_unpack_bytes; - MPIR_gpu_req yreq; + MPIR_async_req async_req; mpi_errno = MPIR_Ilocalcopy_gpu(wc_buf, wc->len, MPI_BYTE, 0, NULL, recv_buf, recv_count, - datatype, 0, NULL, MPL_GPU_COPY_H2D, engine_type, 1, &yreq); + datatype, 0, NULL, MPL_GPU_COPY_H2D, engine_type, 1, + &async_req); MPIR_ERR_CHECK(mpi_errno); actual_unpack_bytes = wc->len; task = MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_RECV, wc_buf, - actual_unpack_bytes, rreq, yreq); + actual_unpack_bytes, rreq, async_req); DL_APPEND(MPIDI_OFI_global.gpu_recv_task_queue[vci_local], task); MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += (size_t) actual_unpack_bytes; } else { @@ -214,18 +215,18 @@ static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, if (likely(event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE)) { /* FIXME: current design unpacks all bytes from host buffer, overflow check is missing. 
*/ MPI_Aint actual_unpack_bytes; - MPIR_gpu_req yreq; + MPIR_async_req async_req; mpi_errno = MPIR_Ilocalcopy_gpu(wc_buf, (MPI_Aint) wc->len, MPI_BYTE, 0, NULL, (char *) recv_buf, (MPI_Aint) recv_count, datatype, MPIDI_OFI_REQUEST(rreq, pipeline_info.offset), NULL, - MPL_GPU_COPY_H2D, engine_type, 1, &yreq); + MPL_GPU_COPY_H2D, engine_type, 1, &async_req); MPIR_ERR_CHECK(mpi_errno); actual_unpack_bytes = wc->len; MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += (size_t) actual_unpack_bytes; task = MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_RECV, wc_buf, actual_unpack_bytes, - rreq, yreq); + rreq, async_req); DL_APPEND(MPIDI_OFI_global.gpu_recv_task_queue[vci_local], task); } else { MPIR_ERR_CHKANDJUMP(true, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 493980377f5..d0166c5ef18 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -595,7 +595,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_multx_receiver_nic_index(MPIR_Comm * comm return nic_idx; } -/* cq bufferring routines -- +/* cq buffering routines -- * in particular, when we encounter EAGAIN error during progress, such as during * active message handling, recursively calling progress may result in unpredictable * behaviors (e.g. stack overflow). 
Thus we need use the cq buffering to avoid @@ -840,7 +840,7 @@ MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_task_t *MPIDI_OFI_create_gpu_task(MPIDI_O type, void *buf, size_t len, MPIR_Request * request, - MPIR_gpu_req yreq) + MPIR_async_req async_req) { MPIDI_OFI_gpu_task_t *task = (MPIDI_OFI_gpu_task_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_task_t), MPL_MEM_OTHER); @@ -850,7 +850,7 @@ MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_task_t *MPIDI_OFI_create_gpu_task(MPIDI_O task->buf = buf; task->len = len; task->request = request; - task->yreq = yreq; + task->areq = async_req; task->prev = NULL; task->next = NULL; return task; @@ -922,7 +922,7 @@ static int MPIDI_OFI_gpu_progress_send(void) goto fn_exit; } MPI_Aint actual_pack_bytes; - MPIR_gpu_req yreq; + MPIR_async_req async_req; int commit = send_task->left_sz <= chunk_sz ? 1 : 0; if (!commit && !MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST && @@ -933,12 +933,12 @@ static int MPIDI_OFI_gpu_progress_send(void) MPIR_Ilocalcopy_gpu((char *) send_task->send_buf, send_task->count, datatype, send_task->offset, &send_task->attr, host_buf, chunk_sz, MPI_BYTE, 0, NULL, MPL_GPU_COPY_D2H, engine_type, - commit, &yreq); + commit, &async_req); MPIR_ERR_CHECK(mpi_errno); actual_pack_bytes = chunk_sz; task = MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_SEND, host_buf, actual_pack_bytes, - send_task->sreq, yreq); + send_task->sreq, async_req); send_task->offset += (size_t) actual_pack_bytes; send_task->left_sz -= (size_t) actual_pack_bytes; vci_local = MPIDI_OFI_REQUEST(send_task->sreq, pipeline_info.vci_local); @@ -1027,13 +1027,13 @@ static int MPIDI_OFI_gpu_progress_task(MPIDI_OFI_gpu_task_t * gpu_queue[], int v goto fn_exit; } - MPIR_gpu_req *yreq = &task->yreq; + MPIR_async_req *areq = &task->async_req; int completed = 0; - if (yreq->type == MPIR_GPU_REQUEST) { - mpi_errno = MPL_gpu_test(&yreq->u.gpu_req, &completed); + if (areq->type == MPIR_GPU_REQUEST) { + mpi_errno = MPL_gpu_test(&areq->u.gpu_req, &completed); MPIR_ERR_CHECK(mpi_errno); - } 
else if (yreq->type == MPIR_TYPEREP_REQUEST) { - MPIR_Typerep_test(yreq->u.y_req, &completed); + } else if (areq->type == MPIR_TYPEREP_REQUEST) { + MPIR_Typerep_test(areq->u.y_req, &completed); } else { completed = 1; } diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index fe32de1918f..77dbcf8a90c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -352,7 +352,7 @@ typedef struct MPIDI_OFI_gpu_task { void *buf; size_t len; MPIR_Request *request; - MPIR_gpu_req yreq; + MPIR_async_req async_req; struct MPIDI_OFI_gpu_task *next, *prev; } MPIDI_OFI_gpu_task_t; diff --git a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c index 0ba044992a4..5776b120305 100644 --- a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c +++ b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c @@ -453,7 +453,7 @@ int MPIDI_GPU_ipc_fast_memcpy(MPIDI_IPCI_ipc_handle_t ipc_handle, void *dest_vad struct gpu_ipc_async { MPIR_Request *rreq; /* async handle */ - MPIR_gpu_req yreq; + MPIR_async_req async_req; /* for unmap */ void *src_buf; MPIDI_GPU_ipc_handle_t gpu_handle; @@ -465,16 +465,16 @@ static int gpu_ipc_async_poll(MPIR_Async_thing * thing) int is_done = 0; struct gpu_ipc_async *p = MPIR_Async_thing_get_state(thing); - switch (p->yreq.type) { + switch (p->async_req.type) { case MPIR_NULL_REQUEST: /* a dummy, immediately complete */ is_done = 1; break; case MPIR_TYPEREP_REQUEST: - MPIR_Typerep_test(p->yreq.u.y_req, &is_done); + MPIR_Typerep_test(p->async_req.u.y_req, &is_done); break; case MPIR_GPU_REQUEST: - err = MPL_gpu_test(&p->yreq.u.gpu_req, &is_done); + err = MPL_gpu_test(&p->async_req.u.gpu_req, &is_done); MPIR_Assertp(err == MPL_SUCCESS); break; default: @@ -498,7 +498,7 @@ static int gpu_ipc_async_poll(MPIR_Async_thing * thing) return MPIR_ASYNC_THING_NOPROGRESS; } -int MPIDI_GPU_ipc_async_start(MPIR_Request * rreq, MPIR_gpu_req * req_p, +int MPIDI_GPU_ipc_async_start(MPIR_Request * rreq, MPIR_async_req * 
req_p, void *src_buf, MPIDI_GPU_ipc_handle_t gpu_handle) { int mpi_errno = MPI_SUCCESS; @@ -509,9 +509,9 @@ int MPIDI_GPU_ipc_async_start(MPIR_Request * rreq, MPIR_gpu_req * req_p, p->src_buf = src_buf; p->gpu_handle = gpu_handle; if (req_p) { - p->yreq = *req_p; + p->async_req = *req_p; } else { - p->yreq.type = MPIR_NULL_REQUEST; + p->async_req.type = MPIR_NULL_REQUEST; } mpi_errno = MPIR_Async_things_add(gpu_ipc_async_poll, p); diff --git a/src/mpid/ch4/shm/ipc/gpu/gpu_post.h b/src/mpid/ch4/shm/ipc/gpu/gpu_post.h index 8a113cb5e03..78285de80bd 100644 --- a/src/mpid/ch4/shm/ipc/gpu/gpu_post.h +++ b/src/mpid/ch4/shm/ipc/gpu/gpu_post.h @@ -25,7 +25,7 @@ typedef struct { int max_subdev_id; } MPIDI_GPU_device_info_t; -int MPIDI_GPU_ipc_async_start(MPIR_Request * rreq, MPIR_gpu_req * req_p, +int MPIDI_GPU_ipc_async_start(MPIR_Request * rreq, MPIR_async_req * req_p, void *src_buf, MPIDI_GPU_ipc_handle_t gpu_handle); #endif /* GPU_POST_H_INCLUDED */ diff --git a/src/mpid/ch4/shm/ipc/src/ipc_p2p.h b/src/mpid/ch4/shm/ipc/src/ipc_p2p.h index c658f4e1a09..45f88ff7a5a 100644 --- a/src/mpid/ch4/shm/ipc/src/ipc_p2p.h +++ b/src/mpid/ch4/shm/ipc/src/ipc_p2p.h @@ -246,19 +246,19 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_IPCI_handle_lmt_recv(MPIDI_IPC_hdr * ipc_hdr, src_count = ipc_hdr->count; src_dt = src_dt_ptr->handle; } - MPIR_gpu_req yreq; + MPIR_async_req async_req; MPL_gpu_engine_type_t engine = MPIDI_IPCI_choose_engine(ipc_hdr->ipc_handle.gpu.global_dev_id, dev_id); mpi_errno = MPIR_Ilocalcopy_gpu(src_buf, src_count, src_dt, 0, NULL, MPIDIG_REQUEST(rreq, buffer), MPIDIG_REQUEST(rreq, count), MPIDIG_REQUEST(rreq, datatype), 0, &attr, - MPL_GPU_COPY_DIRECTION_NONE, engine, true, &yreq); + MPL_GPU_COPY_DIRECTION_NONE, engine, true, &async_req); MPIR_ERR_CHECK(mpi_errno); if (src_dt_ptr) { MPIR_Datatype_free(src_dt_ptr); } - mpi_errno = MPIDI_GPU_ipc_async_start(rreq, &yreq, src_buf, ipc_hdr->ipc_handle.gpu); + mpi_errno = MPIDI_GPU_ipc_async_start(rreq, &async_req, src_buf, 
ipc_hdr->ipc_handle.gpu); MPIR_ERR_CHECK(mpi_errno); goto fn_exit; } else if (ipc_hdr->ipc_type == MPIDI_IPCI_TYPE__NONE) { From 20464b644ccff0f0941ca4234213763d9712efe0 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 31 Jan 2024 10:28:34 -0600 Subject: [PATCH 02/17] misc: add MPIR_async_test Add an inline wrapper for testing MPIR_async_req. Modify the order of header inclusion due to the dependency on typerep_pre.h. --- src/include/mpiimpl.h | 2 +- src/include/mpir_misc.h | 20 ++++++++++++++++++++ src/include/mpir_typerep.h | 2 -- src/mpi/datatype/typerep/src/typerep_pre.h | 3 +++ 4 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/include/mpiimpl.h b/src/include/mpiimpl.h index 35e5484132a..c21894d2ba3 100644 --- a/src/include/mpiimpl.h +++ b/src/include/mpiimpl.h @@ -155,7 +155,6 @@ typedef struct MPIR_Stream MPIR_Stream; /******************* PART 3: DEVICE INDEPENDENT HEADERS **********************/ /*****************************************************************************/ -#include "mpir_misc.h" #include "mpir_dbg.h" #include "mpir_objects.h" #include "mpir_strerror.h" @@ -166,6 +165,7 @@ typedef struct MPIR_Stream MPIR_Stream; #include "mpir_mem.h" #include "mpir_info.h" #include "mpir_errcodes.h" +#include "mpir_misc.h" #include "mpir_errhandler.h" #include "mpir_attr_generic.h" #include "mpir_contextid.h" diff --git a/src/include/mpir_misc.h b/src/include/mpir_misc.h index 24237f50f3f..2a0c1277c13 100644 --- a/src/include/mpir_misc.h +++ b/src/include/mpir_misc.h @@ -63,6 +63,26 @@ typedef struct { MPIR_request_type_t type; } MPIR_async_req; +MPL_STATIC_INLINE_PREFIX void MPIR_async_test(MPIR_async_req * areq, int *is_done) +{ + int err; + switch (areq->type) { + case MPIR_NULL_REQUEST: + /* a dummy, immediately complete */ + *is_done = 1; + break; + case MPIR_TYPEREP_REQUEST: + MPIR_Typerep_test(areq->u.y_req, is_done); + break; + case MPIR_GPU_REQUEST: + err = MPL_gpu_test(&areq->u.gpu_req, is_done); + MPIR_Assertp(err == 
MPL_SUCCESS); + break; + default: + MPIR_Assert(0); + } +} + int MPIR_Localcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype, void *recvbuf, MPI_Aint recvcount, MPI_Datatype recvtype); int MPIR_Ilocalcopy(const void *sendbuf, MPI_Aint sendcount, MPI_Datatype sendtype, diff --git a/src/include/mpir_typerep.h b/src/include/mpir_typerep.h index 536a4502c77..81d9ec48cab 100644 --- a/src/include/mpir_typerep.h +++ b/src/include/mpir_typerep.h @@ -79,8 +79,6 @@ int MPIR_Typerep_ipack(const void *inbuf, MPI_Aint incount, MPI_Datatype datatyp int MPIR_Typerep_iunpack(const void *inbuf, MPI_Aint insize, void *outbuf, MPI_Aint outcount, MPI_Datatype datatype, MPI_Aint outoffset, MPI_Aint * actual_unpack_bytes, MPIR_Typerep_req * typerep_req, uint32_t flags); -int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req); -int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed); int MPIR_Typerep_size_external32(MPI_Datatype type); int MPIR_Typerep_pack_external(const void *inbuf, MPI_Aint incount, MPI_Datatype datatype, diff --git a/src/mpi/datatype/typerep/src/typerep_pre.h b/src/mpi/datatype/typerep/src/typerep_pre.h index 022510fbe2c..347bed20a41 100644 --- a/src/mpi/datatype/typerep/src/typerep_pre.h +++ b/src/mpi/datatype/typerep/src/typerep_pre.h @@ -28,4 +28,7 @@ typedef struct { #define MPIR_TYPEREP_HANDLE_NULL NULL #endif +int MPIR_Typerep_wait(MPIR_Typerep_req typerep_req); +int MPIR_Typerep_test(MPIR_Typerep_req typerep_req, int *completed); + #endif /* TYPEREP_PRE_H_INCLUDED */ From e0e64eebf2451547e65b2820ea0bb5666b91f76b Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 31 Jan 2024 10:30:33 -0600 Subject: [PATCH 03/17] ch4/ipc: refactor gpu_ipc_async_poll to use MPIR_async_test --- src/mpid/ch4/shm/ipc/gpu/gpu_post.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c index 5776b120305..538cfdfb46a 100644 --- 
a/src/mpid/ch4/shm/ipc/gpu/gpu_post.c +++ b/src/mpid/ch4/shm/ipc/gpu/gpu_post.c @@ -465,21 +465,7 @@ static int gpu_ipc_async_poll(MPIR_Async_thing * thing) int is_done = 0; struct gpu_ipc_async *p = MPIR_Async_thing_get_state(thing); - switch (p->async_req.type) { - case MPIR_NULL_REQUEST: - /* a dummy, immediately complete */ - is_done = 1; - break; - case MPIR_TYPEREP_REQUEST: - MPIR_Typerep_test(p->async_req.u.y_req, &is_done); - break; - case MPIR_GPU_REQUEST: - err = MPL_gpu_test(&p->async_req.u.gpu_req, &is_done); - MPIR_Assertp(err == MPL_SUCCESS); - break; - default: - MPIR_Assert(0); - } + MPIR_async_test(&(p->async_req), &is_done); if (is_done) { int vci = MPIDIG_REQUEST(p->rreq, req->local_vci); From 010f231a6a85d768de62a282c71d4b2c4f36ccc1 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 31 Jan 2024 10:59:53 -0600 Subject: [PATCH 04/17] ch4/ofi: refactor pipeline recv async copy Refactor the async copy in receive events using MPIR_async facilities. --- src/mpid/ch4/netmod/ofi/Makefile.mk | 1 + src/mpid/ch4/netmod/ofi/ofi_events.c | 35 +------ src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 113 +++++++++++++++++++++ src/mpid/ch4/netmod/ofi/ofi_impl.h | 50 +-------- 4 files changed, 123 insertions(+), 76 deletions(-) create mode 100644 src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c diff --git a/src/mpid/ch4/netmod/ofi/Makefile.mk b/src/mpid/ch4/netmod/ofi/Makefile.mk index 0ccce9d6181..01addc1021c 100644 --- a/src/mpid/ch4/netmod/ofi/Makefile.mk +++ b/src/mpid/ch4/netmod/ofi/Makefile.mk @@ -21,6 +21,7 @@ mpi_core_sources += src/mpid/ch4/netmod/ofi/func_table.c \ src/mpid/ch4/netmod/ofi/ofi_progress.c \ src/mpid/ch4/netmod/ofi/ofi_am_events.c \ src/mpid/ch4/netmod/ofi/ofi_nic.c \ + src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c \ src/mpid/ch4/netmod/ofi/globals.c \ src/mpid/ch4/netmod/ofi/init_provider.c \ src/mpid/ch4/netmod/ofi/init_settings.c \ diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 
e6ca3921712..ba70becf599 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -110,13 +110,11 @@ static int pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, int event_id) { int mpi_errno = MPI_SUCCESS; - int vci_local, i; + int i; MPIDI_OFI_gpu_pipeline_request *req; MPIR_Request *rreq; void *wc_buf = NULL; int in_use MPL_UNUSED; - MPIDI_OFI_gpu_task_t *task = NULL; - int engine_type = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE; MPIR_FUNC_ENTER; @@ -130,7 +128,6 @@ static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, noncontig.pack.datatype); fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr); - vci_local = MPIDI_OFI_REQUEST(rreq, pipeline_info.vci_local); if (event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT) { rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, true); @@ -147,19 +144,9 @@ static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, if (wc->len > 0) { MPIR_Assert(n_chunks == 0); /* First chunk arrives. 
*/ - MPI_Aint actual_unpack_bytes; - MPIR_async_req async_req; - mpi_errno = - MPIR_Ilocalcopy_gpu(wc_buf, wc->len, MPI_BYTE, 0, NULL, recv_buf, recv_count, - datatype, 0, NULL, MPL_GPU_COPY_H2D, engine_type, 1, - &async_req); + mpi_errno = MPIDI_OFI_gpu_pipeline_recv_copy(rreq, wc_buf, wc->len, + recv_buf, recv_count, datatype); MPIR_ERR_CHECK(mpi_errno); - actual_unpack_bytes = wc->len; - task = - MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_RECV, wc_buf, - actual_unpack_bytes, rreq, async_req); - DL_APPEND(MPIDI_OFI_global.gpu_recv_task_queue[vci_local], task); - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += (size_t) actual_unpack_bytes; } else { /* free this chunk */ MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, wc_buf); @@ -213,21 +200,9 @@ static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, } } else { if (likely(event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE)) { - /* FIXME: current design unpacks all bytes from host buffer, overflow check is missing. 
*/ - MPI_Aint actual_unpack_bytes; - MPIR_async_req async_req; - mpi_errno = - MPIR_Ilocalcopy_gpu(wc_buf, (MPI_Aint) wc->len, MPI_BYTE, 0, NULL, - (char *) recv_buf, (MPI_Aint) recv_count, datatype, - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset), NULL, - MPL_GPU_COPY_H2D, engine_type, 1, &async_req); + mpi_errno = MPIDI_OFI_gpu_pipeline_recv_copy(rreq, wc_buf, wc->len, + recv_buf, recv_count, datatype); MPIR_ERR_CHECK(mpi_errno); - actual_unpack_bytes = wc->len; - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += (size_t) actual_unpack_bytes; - task = - MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_RECV, wc_buf, actual_unpack_bytes, - rreq, async_req); - DL_APPEND(MPIDI_OFI_global.gpu_recv_task_queue[vci_local], task); } else { MPIR_ERR_CHKANDJUMP(true, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); } diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c new file mode 100644 index 00000000000..37d1c9862ac --- /dev/null +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -0,0 +1,113 @@ +/* + * Copyright (C) by Argonne National Laboratory + * See COPYRIGHT in top-level directory + */ + +#include "mpidimpl.h" +#include "mpir_async_things.h" + +/* ------------------------------------ + * recv_copy: async copy from host_buf to user buffer in recv event + */ +struct recv_copy { + MPIR_Request *rreq; + /* async handle */ + MPIR_async_req async_req; + /* for cleanups */ + void *buf; +}; + +static int recv_copy_poll(MPIR_Async_thing * thing); +static void recv_copy_complete(MPIR_Request * rreq, void *buf); + +int MPIDI_OFI_gpu_pipeline_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, + void *recv_buf, MPI_Aint count, MPI_Datatype datatype) +{ + int mpi_errno = MPI_SUCCESS; + + MPI_Aint offset = MPIDI_OFI_REQUEST(rreq, pipeline_info.offset); + int engine_type = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE; + + /* FIXME: current design unpacks all bytes from host buffer, overflow check is missing. 
*/ + MPIR_async_req async_req; + mpi_errno = MPIR_Ilocalcopy_gpu(buf, chunk_sz, MPI_BYTE, 0, NULL, + recv_buf, count, datatype, offset, NULL, + MPL_GPU_COPY_H2D, engine_type, 1, &async_req); + MPIR_ERR_CHECK(mpi_errno); + + MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += chunk_sz; + + struct recv_copy *p; + p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER); + MPIR_Assert(p); + + p->rreq = rreq; + p->async_req = async_req; + p->buf = buf; + + mpi_errno = MPIR_Async_things_add(recv_copy_poll, p); + + return mpi_errno; +} + +static int recv_copy_poll(MPIR_Async_thing * thing) +{ + int is_done = 0; + + struct recv_copy *p = MPIR_Async_thing_get_state(thing); + MPIR_async_test(&(p->async_req), &is_done); + + if (is_done) { + recv_copy_complete(p->rreq, p->buf); + MPL_free(p); + return MPIR_ASYNC_THING_DONE; + } + + return MPIR_ASYNC_THING_NOPROGRESS; +} + +static void recv_copy_complete(MPIR_Request * rreq, void *buf) +{ + int mpi_errno = MPI_SUCCESS; + int c; + MPIR_cc_decr(rreq->cc_ptr, &c); + if (c == 0) { + /* all chunks arrived and copied */ + if (unlikely(MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync))) { + MPIR_Comm *comm = rreq->comm; + uint64_t ss_bits = + MPIDI_OFI_init_sendtag(MPL_atomic_relaxed_load_int + (&MPIDI_OFI_REQUEST(rreq, util_id)), + MPIR_Comm_rank(comm), rreq->status.MPI_TAG, + MPIDI_OFI_SYNC_SEND_ACK); + int r = rreq->status.MPI_SOURCE; + int vci_src = MPIDI_get_vci(SRC_VCI_FROM_RECVER, comm, r, comm->rank, + rreq->status.MPI_TAG); + int vci_dst = MPIDI_get_vci(DST_VCI_FROM_RECVER, comm, r, comm->rank, + rreq->status.MPI_TAG); + int vci_local = vci_dst; + int vci_remote = vci_src; + int nic = 0; + int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic); + fi_addr_t dest_addr = MPIDI_OFI_comm_to_phys(comm, r, nic, vci_remote); + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci_local).lock); + MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, NULL /* buf */ , + 0 /* len */ , + MPIR_Comm_rank(comm), dest_addr, ss_bits), + vci_local, tinjectdata); 
+ MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci_local).lock); + } + + MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(rreq, datatype)); + /* Set number of bytes in status. */ + MPIR_STATUS_SET_COUNT(rreq->status, MPIDI_OFI_REQUEST(rreq, pipeline_info.offset)); + + MPIR_Request_free(rreq); + } + + /* Free host buffer, yaksa request and task. */ + MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, buf); + return; + fn_fail: + MPIR_Assertp(0); +} diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index d0166c5ef18..c15eedf4e5f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -827,6 +827,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_free_pack_buffer(void *ptr) } } +int MPIDI_OFI_gpu_pipeline_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, + void *recv_buf, MPI_Aint count, MPI_Datatype datatype); + MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_pipeline_chunk_size(size_t data_sz) { int chunk_size = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; @@ -1063,52 +1066,7 @@ static int MPIDI_OFI_gpu_progress_task(MPIDI_OFI_gpu_task_t * gpu_queue[], int v DL_DELETE(gpu_queue[vni], task); MPL_free(task); } else { - MPIR_Assert(task->type == MPIDI_OFI_PIPELINE_RECV); - int c; - MPIR_cc_decr(request->cc_ptr, &c); - if (c == 0) { - /* If synchronous, ack and complete when the ack is done */ - if (unlikely(MPIDI_OFI_REQUEST(request, pipeline_info.is_sync))) { - MPIR_Comm *comm = request->comm; - uint64_t ss_bits = - MPIDI_OFI_init_sendtag(MPL_atomic_relaxed_load_int - (&MPIDI_OFI_REQUEST(request, util_id)), - MPIR_Comm_rank(comm), request->status.MPI_TAG, - MPIDI_OFI_SYNC_SEND_ACK); - int r = request->status.MPI_SOURCE; - int vci_src = MPIDI_get_vci(SRC_VCI_FROM_RECVER, comm, r, comm->rank, - request->status.MPI_TAG); - int vci_dst = MPIDI_get_vci(DST_VCI_FROM_RECVER, comm, r, comm->rank, - request->status.MPI_TAG); - int vci_local = vci_dst; - int vci_remote = vci_src; - int 
nic = 0; - int ctx_idx = MPIDI_OFI_get_ctx_index(vci_local, nic); - fi_addr_t dest_addr = MPIDI_OFI_comm_to_phys(comm, r, nic, vci_remote); - MPIDI_OFI_CALL_RETRY(fi_tinjectdata - (MPIDI_OFI_global.ctx[ctx_idx].tx, NULL /* buf */ , - 0 /* len */ , - MPIR_Comm_rank(comm), dest_addr, ss_bits), - vci_local, tinjectdata); - } - - MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(request, datatype)); - /* Set number of bytes in status. */ - MPIR_STATUS_SET_COUNT(request->status, - MPIDI_OFI_REQUEST(request, pipeline_info.offset)); - - MPIR_Request_free(request); - } - - /* For recv, now task can be deleted from DL. */ - DL_DELETE(gpu_queue[vni], task); - /* Free host buffer, yaksa request and task. */ - if (task->type == MPIDI_OFI_PIPELINE_RECV) - MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - task->buf); - else - MPIDI_OFI_gpu_free_pack_buffer(task->buf); - MPL_free(task); + MPIR_Assert(0); } } else { goto fn_exit; From 79f020ff7bed73f3888e3236d4dd0b96ac397867 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 31 Jan 2024 12:26:15 -0600 Subject: [PATCH 05/17] ch4/ofi: refactor pipeline send async copy Refactor the async copy before sending a chunk. 
---
 src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 85 ++++++++++++++++++++++
 src/mpid/ch4/netmod/ofi/ofi_impl.h         | 34 ++-------
 2 files changed, 93 insertions(+), 26 deletions(-)

diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c
index 37d1c9862ac..cf0192b9c2d 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c
+++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c
@@ -6,6 +6,88 @@
 #include "mpidimpl.h"
 #include "mpir_async_things.h"
 
+/* ------------------------------------
+ * send_copy: async copy before sending the chunk data
+ */
+struct send_copy {
+    MPIR_Request *sreq;
+    /* async handle */
+    MPIR_async_req async_req;
+    /* for sending data */
+    const void *buf;
+    MPI_Aint chunk_sz;
+};
+
+static int send_copy_poll(MPIR_Async_thing * thing);
+static void send_copy_complete(MPIR_Request * sreq, const void *buf, MPI_Aint chunk_sz);
+
+int MPIDI_OFI_gpu_pipeline_send_copy(MPIR_Request * sreq, MPIR_async_req * areq,
+                                     const void *buf, MPI_Aint chunk_sz)
+{
+    int mpi_errno = MPI_SUCCESS;
+
+    struct send_copy *p;
+    p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER);
+    MPIR_Assert(p);
+
+    p->sreq = sreq;
+    p->async_req = *areq;
+    p->buf = buf;
+    p->chunk_sz = chunk_sz;
+
+    mpi_errno = MPIR_Async_things_add(send_copy_poll, p);
+
+    return mpi_errno;
+}
+
+static int send_copy_poll(MPIR_Async_thing * thing)
+{
+    int is_done = 0;
+
+    struct send_copy *p = MPIR_Async_thing_get_state(thing);
+    MPIR_async_test(&(p->async_req), &is_done);
+
+    if (is_done) {
+        /* finished copy, go ahead send the data */
+        send_copy_complete(p->sreq, p->buf, p->chunk_sz);
+        MPL_free(p);
+        return MPIR_ASYNC_THING_DONE;
+    }
+
+    return MPIR_ASYNC_THING_NOPROGRESS;
+}
+
+static void send_copy_complete(MPIR_Request * sreq, const void *buf, MPI_Aint chunk_sz)
+{
+    int mpi_errno = MPI_SUCCESS;
+    int vci_local = MPIDI_OFI_REQUEST(sreq, pipeline_info.vci_local);
+
+    MPIDI_OFI_gpu_pipeline_request *chunk_req = (MPIDI_OFI_gpu_pipeline_request *)
+        
MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER);
+    MPIR_Assertp(chunk_req);
+
+    chunk_req->parent = sreq;
+    chunk_req->event_id = MPIDI_OFI_EVENT_SEND_GPU_PIPELINE;
+    chunk_req->buf = (void *) buf;
+
+    int ctx_idx = MPIDI_OFI_REQUEST(sreq, pipeline_info.ctx_idx);
+    fi_addr_t remote_addr = MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr);
+    uint64_t cq_data = MPIDI_OFI_REQUEST(sreq, pipeline_info.cq_data);
+    uint64_t match_bits = MPIDI_OFI_REQUEST(sreq, pipeline_info.match_bits);
+    match_bits |= MPIDI_OFI_GPU_PIPELINE_SEND;
+    MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci_local).lock);
+    MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx,
+                                      buf, chunk_sz, NULL /* desc */ ,
+                                      cq_data, remote_addr, match_bits,
+                                      (void *) &chunk_req->context), vci_local, fi_tsenddata);
+    MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci_local).lock);
+    /* both send buffer and chunk_req will be freed in pipeline_send_event */
+
+    return;
+  fn_fail:
+    MPIR_Assert(0);
+}
+
 /* ------------------------------------
  * recv_copy: async copy from host_buf to user buffer in recv event
  */
@@ -47,7 +129,10 @@ int MPIDI_OFI_gpu_pipeline_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint ch
 
     mpi_errno = MPIR_Async_things_add(recv_copy_poll, p);
 
+  fn_exit:
     return mpi_errno;
+  fn_fail:
+    goto fn_exit;
 }
 
 static int recv_copy_poll(MPIR_Async_thing * thing)
diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h
index c15eedf4e5f..0939045a8a7 100644
--- a/src/mpid/ch4/netmod/ofi/ofi_impl.h
+++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h
@@ -827,6 +827,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_free_pack_buffer(void *ptr)
     }
 }
 
+int MPIDI_OFI_gpu_pipeline_send_copy(MPIR_Request * sreq, MPIR_async_req * areq,
+                                     const void *buf, MPI_Aint chunk_sz);
 int MPIDI_OFI_gpu_pipeline_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz,
                                      void *recv_buf, MPI_Aint count, MPI_Datatype datatype);
 
@@ -916,7 +918,6 @@ static int MPIDI_OFI_gpu_progress_send(void)
     MPI_Datatype datatype 
= MPIDI_OFI_REQUEST(send_task->sreq, datatype); int block_sz = MPIDI_OFI_REQUEST(send_task->sreq, pipeline_info.chunk_sz); while (send_task->left_sz > 0) { - MPIDI_OFI_gpu_task_t *task = NULL; chunk_sz = send_task->left_sz > block_sz ? block_sz : send_task->left_sz; host_buf = NULL; MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, @@ -938,19 +939,18 @@ static int MPIDI_OFI_gpu_progress_send(void) MPI_BYTE, 0, NULL, MPL_GPU_COPY_D2H, engine_type, commit, &async_req); MPIR_ERR_CHECK(mpi_errno); + actual_pack_bytes = chunk_sz; - task = - MPIDI_OFI_create_gpu_task(MPIDI_OFI_PIPELINE_SEND, host_buf, actual_pack_bytes, - send_task->sreq, async_req); send_task->offset += (size_t) actual_pack_bytes; send_task->left_sz -= (size_t) actual_pack_bytes; - vci_local = MPIDI_OFI_REQUEST(send_task->sreq, pipeline_info.vci_local); - MPIR_Assert(vci_local < MPIDI_CH4_MAX_VCIS); - DL_APPEND(MPIDI_OFI_global.gpu_send_task_queue[vci_local], task); send_task->n_chunks++; /* Increase request completion cnt, cc is 1 more than necessary * to prevent parent request being freed prematurally. 
*/ MPIR_cc_inc(send_task->sreq->cc_ptr); + + mpi_errno = MPIDI_OFI_gpu_pipeline_send_copy(send_task->sreq, &yreq, + host_buf, chunk_sz); + MPIR_ERR_CHECK(mpi_errno); } /* all done, decrease cc by 1 to allow parent request to be freed * when complete */ @@ -1046,25 +1046,7 @@ static int MPIDI_OFI_gpu_progress_task(MPIDI_OFI_gpu_task_t * gpu_queue[], int v MPIR_Request *request = task->request; if (task->type == MPIDI_OFI_PIPELINE_SEND) { - MPIDI_OFI_gpu_pipeline_request *chunk_req = (MPIDI_OFI_gpu_pipeline_request *) - MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER); - MPIR_ERR_CHKANDJUMP1(chunk_req == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem", - "**nomem %s", "GPU pipelining chunk_req alloc"); - chunk_req->parent = request; - chunk_req->event_id = MPIDI_OFI_EVENT_SEND_GPU_PIPELINE; - chunk_req->buf = task->buf; - MPIDI_OFI_CALL_RETRY(fi_tsenddata - (MPIDI_OFI_global.ctx - [MPIDI_OFI_REQUEST(request, pipeline_info.ctx_idx)].tx, - task->buf, task->len, NULL /* desc */ , - MPIDI_OFI_REQUEST(request, pipeline_info.cq_data), - MPIDI_OFI_REQUEST(request, pipeline_info.remote_addr), - MPIDI_OFI_REQUEST(request, - pipeline_info.match_bits) | - MPIDI_OFI_GPU_PIPELINE_SEND, (void *) &chunk_req->context), - vni, fi_tsenddata); - DL_DELETE(gpu_queue[vni], task); - MPL_free(task); + MPIR_Assert(0); } else { MPIR_Assert(0); } From 70469a44e12160ececc646cea28120f427973963 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 31 Jan 2024 14:14:31 -0600 Subject: [PATCH 06/17] ch4/ofi: remove MPIDI_OFI_gpu_progress_task Both gpu_send_task_queue and gpu_recv_task_queue have been ported to async things. 
--- src/mpid/ch4/netmod/ofi/ofi_impl.h | 54 ----------------------------- src/mpid/ch4/netmod/ofi/ofi_types.h | 2 -- 2 files changed, 56 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 0939045a8a7..f8d277a0269 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -902,8 +902,6 @@ MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_pending_send_t *MPIDI_OFI_create_send_tas return task; } -static int MPIDI_OFI_gpu_progress_task(MPIDI_OFI_gpu_task_t * gpu_queue[], int vni); - static int MPIDI_OFI_gpu_progress_send(void) { int mpi_errno = MPI_SUCCESS; @@ -961,10 +959,6 @@ static int MPIDI_OFI_gpu_progress_send(void) send_task->n_chunks); DL_DELETE(MPIDI_OFI_global.gpu_send_queue, send_task); MPL_free(send_task); - - if (vci_local != -1) - MPIDI_OFI_gpu_progress_task(MPIDI_OFI_global.gpu_send_task_queue, vci_local); - } fn_exit: @@ -1018,58 +1012,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress_recv(void) goto fn_exit; } -static int MPIDI_OFI_gpu_progress_task(MPIDI_OFI_gpu_task_t * gpu_queue[], int vni) -{ - int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_gpu_task_t *task = NULL; - MPIDI_OFI_gpu_task_t *tmp; - - DL_FOREACH_SAFE(gpu_queue[vni], task, tmp) { - if (task->status == MPIDI_OFI_PIPELINE_EXEC) { - /* Avoid the deadlock of re-launching an executing OFI task. */ - goto fn_exit; - } - - MPIR_async_req *areq = &task->async_req; - int completed = 0; - if (areq->type == MPIR_GPU_REQUEST) { - mpi_errno = MPL_gpu_test(&areq->u.gpu_req, &completed); - MPIR_ERR_CHECK(mpi_errno); - } else if (areq->type == MPIR_TYPEREP_REQUEST) { - MPIR_Typerep_test(areq->u.y_req, &completed); - } else { - completed = 1; - } - if (completed == 1) { - /* GPU transfer completes. 
 */ - task->status = MPIDI_OFI_PIPELINE_EXEC; - MPIR_Request *request = task->request; - - if (task->type == MPIDI_OFI_PIPELINE_SEND) { - MPIR_Assert(0); - } else { - MPIR_Assert(0); - } - } else { - goto fn_exit; - } - } - - fn_exit: - return mpi_errno; - fn_fail: - mpi_errno = MPI_ERR_OTHER; - goto fn_exit; -} - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress(int vni) { int mpi_errno = MPI_SUCCESS; - mpi_errno = MPIDI_OFI_gpu_progress_task(MPIDI_OFI_global.gpu_recv_task_queue, vni); - MPIR_ERR_CHECK(mpi_errno); - mpi_errno = MPIDI_OFI_gpu_progress_task(MPIDI_OFI_global.gpu_send_task_queue, vni); - MPIR_ERR_CHECK(mpi_errno); mpi_errno = MPIDI_OFI_gpu_progress_send(); MPIR_ERR_CHECK(mpi_errno); mpi_errno = MPIDI_OFI_gpu_progress_recv(); diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index 77dbcf8a90c..c92de0b5640 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -491,8 +491,6 @@ typedef struct { /* GPU pipeline */ MPIDU_genq_private_pool_t gpu_pipeline_send_pool; MPIDU_genq_private_pool_t gpu_pipeline_recv_pool; - MPIDI_OFI_gpu_task_t *gpu_send_task_queue[MPIDI_CH4_MAX_VCIS]; - MPIDI_OFI_gpu_task_t *gpu_recv_task_queue[MPIDI_CH4_MAX_VCIS]; MPIDI_OFI_gpu_pending_recv_t *gpu_recv_queue; MPIDI_OFI_gpu_pending_send_t *gpu_send_queue; From 0ee16a63bb5de0e73f95552aad59ce2a76b3e232 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 31 Jan 2024 17:43:27 -0600 Subject: [PATCH 07/17] ch4/ofi: refactor pipeline send Pipeline send allocates chunk buffers then spawns async copy. The allocation may run out of genq buffers, thus it is designed as async tasks. The send copy is triggered upon completion of buffer alloc, thus it is renamed into spawn_send_copy and turned into internal static function. This removes MPIDI_OFI_global.gpu_send_queue. 
--- src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 94 ++++++++++++++++++-- src/mpid/ch4/netmod/ofi/ofi_impl.h | 99 +--------------------- src/mpid/ch4/netmod/ofi/ofi_init.c | 1 - src/mpid/ch4/netmod/ofi/ofi_send.h | 5 +- src/mpid/ch4/netmod/ofi/ofi_types.h | 1 - 5 files changed, 91 insertions(+), 109 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index cf0192b9c2d..c61d9f2deef 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -6,6 +6,89 @@ #include "mpidimpl.h" #include "mpir_async_things.h" +/* ------------------------------------ + * send_alloc: allocate send chunks and start copy, may postpone as async + */ +struct send_alloc { + MPIR_Request *sreq; + const void *send_buf; + MPI_Aint count; + MPI_Datatype datatype; + MPL_pointer_attr_t attr; + MPI_Aint offset, left_sz; + int n_chunks; +}; + +static int send_alloc_poll(MPIR_Async_thing * thing); + +int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, + MPI_Aint count, MPI_Datatype datatype, + MPL_pointer_attr_t attr, MPI_Aint data_sz) +{ + int mpi_errno = MPI_SUCCESS; + + struct send_alloc *p; + p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER); + MPIR_Assert(p); + + p->sreq = sreq; + p->send_buf = send_buf; + p->count = count; + p->datatype = datatype; + p->attr = attr; + p->left_sz = data_sz; + p->offset = 0; + p->n_chunks = 0; + + mpi_errno = MPIR_Async_things_add(send_alloc_poll, p); + + return mpi_errno; +} + +static int send_alloc_poll(MPIR_Async_thing * thing) +{ + int num_new_chunks = 0; + struct send_alloc *p = MPIR_Async_thing_get_state(thing); + + while (p->left_sz > 0) { + void *host_buf; + MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, &host_buf); + if (host_buf == NULL) { + return (num_new_chunks == 0) ? 
MPIR_ASYNC_THING_NOPROGRESS : MPIR_ASYNC_THING_UPDATED; + } + MPIR_async_req async_req; + MPI_Aint chunk_sz = MPL_MIN(p->left_sz, MPIDI_OFI_REQUEST(p->sreq, pipeline_info.chunk_sz)); + MPL_gpu_engine_type_t engine_type = + MPIDI_OFI_gpu_get_send_engine_type(MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE); + int commit = p->left_sz <= chunk_sz ? 1 : 0; + if (!commit && + !MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST && + p->n_chunks % MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK == + MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK - 1) + commit = 1; + int mpi_errno; + mpi_errno = MPIR_Ilocalcopy_gpu(p->send_buf, p->count, p->datatype, + p->offset, &p->attr, host_buf, chunk_sz, + MPI_BYTE, 0, NULL, MPL_GPU_COPY_D2H, engine_type, + commit, &async_req); + MPIR_Assertp(mpi_errno == MPI_SUCCESS); + + p->offset += (size_t) chunk_sz; + p->left_sz -= (size_t) chunk_sz; + p->n_chunks++; + /* Increase request completion cnt, cc is 1 more than necessary + * to prevent parent request being freed prematurally. 
*/ + MPIR_cc_inc(p->sreq->cc_ptr); + + spawn_pipeline_send(thing, p->sreq, &async_req, host_buf, chunk_sz); + + num_new_chunks++; + } + /* all done */ + MPL_free(p); + return MPIR_ASYNC_THING_DONE; +}; + /* ------------------------------------ * send_copy: async copy before sending the chunk data */ @@ -21,11 +104,10 @@ struct send_copy { static int send_copy_poll(MPIR_Async_thing * thing); static void send_copy_complete(MPIR_Request * sreq, const void *buf, MPI_Aint chunk_sz); -int MPIDI_OFI_gpu_pipeline_send_copy(MPIR_Request * sreq, MPIR_async_req * areq, - const void *buf, MPI_Aint chunk_sz) +static void spawn_send_copy(MPIR_Async_thing * thing, + MPIR_Request * sreq, MPIR_async_req * areq, + const void *buf, MPI_Aint chunk_sz) { - int mpi_errno = MPI_SUCCESS; - struct send_copy *p; p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER); MPIR_Assert(p); @@ -35,9 +117,7 @@ int MPIDI_OFI_gpu_pipeline_send_copy(MPIR_Request * sreq, MPIR_async_req * areq, p->buf = buf; p->chunk_sz = chunk_sz; - mpi_errno = MPIR_Async_things_add(send_copy_complete, p); - - return mpi_errno; + MPIR_Async_thing_spawn(thing, send_copy_poll, p); } static int send_copy_poll(MPIR_Async_thing * thing) diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index f8d277a0269..40073c4fdde 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -827,8 +827,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_free_pack_buffer(void *ptr) } } -int MPIDI_OFI_gpu_pipeline_send_copy(MPIR_Request * sreq, MPIR_async_req * areq, - void *buf, MPI_Aint size); +int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, + MPI_Aint count, MPI_Datatype datatype, + MPL_pointer_attr_t attr, MPI_Aint data_sz); int MPIDI_OFI_gpu_pipeline_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, void *recv_buf, MPI_Aint count, MPI_Datatype datatype); @@ -876,98 +877,6 @@ MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_pending_recv_t return task; } 
-MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_pending_send_t *MPIDI_OFI_create_send_task(MPIR_Request * - req, - void *send_buf, - MPL_pointer_attr_t - attr, - MPI_Aint left_sz, - MPI_Aint count, - int dt_contig) -{ - MPIDI_OFI_gpu_pending_send_t *task = - (MPIDI_OFI_gpu_pending_send_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_pending_send_t), - MPL_MEM_OTHER); - MPIR_Assert(task); - task->sreq = req; - task->attr = attr; - task->send_buf = send_buf; - task->offset = 0; - task->n_chunks = 0; - task->left_sz = left_sz; - task->count = count; - task->dt_contig = dt_contig; - task->prev = NULL; - task->next = NULL; - return task; -} - -static int MPIDI_OFI_gpu_progress_send(void) -{ - int mpi_errno = MPI_SUCCESS; - int engine_type = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_D2H_ENGINE_TYPE; - - while (MPIDI_OFI_global.gpu_send_queue) { - char *host_buf = NULL; - MPI_Aint chunk_sz; - int vci_local = -1; - - MPIDI_OFI_gpu_pending_send_t *send_task = MPIDI_OFI_global.gpu_send_queue; - MPI_Datatype datatype = MPIDI_OFI_REQUEST(send_task->sreq, datatype); - int block_sz = MPIDI_OFI_REQUEST(send_task->sreq, pipeline_info.chunk_sz); - while (send_task->left_sz > 0) { - chunk_sz = send_task->left_sz > block_sz ? block_sz : send_task->left_sz; - host_buf = NULL; - MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, - (void **) &host_buf); - if (host_buf == NULL) { - goto fn_exit; - } - MPI_Aint actual_pack_bytes; - MPIR_async_req async_req; - int commit = send_task->left_sz <= chunk_sz ? 
1 : 0; - if (!commit && - !MPIR_CVAR_GPU_USE_IMMEDIATE_COMMAND_LIST && - send_task->n_chunks % MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK == - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_NUM_BUFFERS_PER_CHUNK - 1) - commit = 1; - mpi_errno = - MPIR_Ilocalcopy_gpu((char *) send_task->send_buf, send_task->count, datatype, - send_task->offset, &send_task->attr, host_buf, chunk_sz, - MPI_BYTE, 0, NULL, MPL_GPU_COPY_D2H, engine_type, - commit, &async_req); - MPIR_ERR_CHECK(mpi_errno); - - actual_pack_bytes = chunk_sz; - send_task->offset += (size_t) actual_pack_bytes; - send_task->left_sz -= (size_t) actual_pack_bytes; - send_task->n_chunks++; - /* Increase request completion cnt, cc is 1 more than necessary - * to prevent parent request being freed prematurally. */ - MPIR_cc_inc(send_task->sreq->cc_ptr); - - mpi_errno = MPIDI_OFI_gpu_pipeline_send_copy(send_task->sreq, &yreq, - host_buf, chunk_sz); - MPIR_ERR_CHECK(mpi_errno); - } - /* all done, decrease cc by 1 to allow parent request to be freed - * when complete */ - MPIR_cc_dec(send_task->sreq->cc_ptr); - /* Update correct number of chunks in immediate data. 
*/ - MPIDI_OFI_idata_set_gpuchunk_bits(&MPIDI_OFI_REQUEST - (send_task->sreq, pipeline_info.cq_data), - send_task->n_chunks); - DL_DELETE(MPIDI_OFI_global.gpu_send_queue, send_task); - MPL_free(send_task); - } - - fn_exit: - return mpi_errno; - fn_fail: - mpi_errno = MPI_ERR_OTHER; - goto fn_exit; -} - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress_recv(void) { int mpi_errno = MPI_SUCCESS; @@ -1016,8 +925,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress(int vni) { int mpi_errno = MPI_SUCCESS; - mpi_errno = MPIDI_OFI_gpu_progress_send(); - MPIR_ERR_CHECK(mpi_errno); mpi_errno = MPIDI_OFI_gpu_progress_recv(); MPIR_ERR_CHECK(mpi_errno); diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.c b/src/mpid/ch4/netmod/ofi/ofi_init.c index 134a62cea4e..8bf54ef73e2 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.c +++ b/src/mpid/ch4/netmod/ofi/ofi_init.c @@ -734,7 +734,6 @@ int MPIDI_OFI_init_local(int *tag_bits) host_free_registered, &MPIDI_OFI_global.gpu_pipeline_recv_pool); MPIR_ERR_CHECK(mpi_errno); - MPIDI_OFI_global.gpu_send_queue = NULL; MPIDI_OFI_global.gpu_recv_queue = NULL; } diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index fc3eb86315f..9d72d3f5f0d 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -309,10 +309,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou match_bits), vci_local, tinjectdata); MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], data_sz); - MPIDI_OFI_gpu_pending_send_t *send_task = - MPIDI_OFI_create_send_task(sreq, (void *) buf, attr, data_sz, count, dt_contig); - DL_APPEND(MPIDI_OFI_global.gpu_send_queue, send_task); - MPIDI_OFI_gpu_progress_send(); + MPIDI_OFI_gpu_pipeline_send(sreq, buf, count, datatype, attr, data_sz); goto fn_exit; } diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index c92de0b5640..f9ebe61bfcc 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ 
b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -492,7 +492,6 @@ typedef struct { MPIDU_genq_private_pool_t gpu_pipeline_send_pool; MPIDU_genq_private_pool_t gpu_pipeline_recv_pool; MPIDI_OFI_gpu_pending_recv_t *gpu_recv_queue; - MPIDI_OFI_gpu_pending_send_t *gpu_send_queue; /* Process management and PMI globals */ int pname_set; From e297eeeb836b1c991036422e85fd2d5d6f083d10 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 31 Jan 2024 23:20:01 -0600 Subject: [PATCH 08/17] ch4/ofi: refactor pipeline recv Pipeline recv allocates chunk buffers and then posts fi_trecv. The allocation may run out of genq buffers and we also control the number of outstanding recvs, thus it is designed as async tasks. The async recv copy is triggered in the recv event when data arrives. This removes MPIDI_OFI_global.gpu_recv_queue. All ofi-layer progress routines for gpu pipelining are now removed. --- src/mpid/ch4/netmod/ofi/ofi_events.c | 43 +--------- src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 85 +++++++++++++++++++- src/mpid/ch4/netmod/ofi/ofi_impl.h | 93 +--------------------- src/mpid/ch4/netmod/ofi/ofi_init.c | 1 - src/mpid/ch4/netmod/ofi/ofi_progress.h | 7 +- src/mpid/ch4/netmod/ofi/ofi_recv.h | 30 +------ src/mpid/ch4/netmod/ofi/ofi_types.h | 1 - 7 files changed, 90 insertions(+), 170 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index ba70becf599..85042ce27a3 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -127,8 +127,6 @@ static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, size_t recv_count = MPIDI_OFI_REQUEST(rreq, noncontig.pack.count); MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, noncontig.pack.datatype); - fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr); - if (event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT) { rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, true); rreq->status.MPI_ERROR = 
MPIDI_OFI_idata_get_error_bits(wc->data); @@ -154,45 +152,8 @@ static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, /* Post recv for remaining chunks. */ MPIR_cc_dec(rreq->cc_ptr); for (i = 0; i < n_chunks; i++) { - int c; - MPIR_cc_incr(rreq->cc_ptr, &c); - - size_t chunk_sz = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; - - char *host_buf = NULL; - MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - (void **) &host_buf); - - MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; - - MPIDI_OFI_gpu_pipeline_request *chunk_req = NULL; - chunk_req = (MPIDI_OFI_gpu_pipeline_request *) - MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER); - if (chunk_req == NULL) { - mpi_errno = MPIR_ERR_OTHER; - goto fn_fail; - } - chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; - chunk_req->parent = rreq; - chunk_req->buf = host_buf; - int ret = 0; - if (!MPIDI_OFI_global.gpu_recv_queue && host_buf) { - ret = fi_trecv - (MPIDI_OFI_global.ctx - [MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx)].rx, - host_buf, chunk_sz, NULL, remote_addr, - MPIDI_OFI_REQUEST(rreq, - pipeline_info.match_bits) | - MPIDI_OFI_GPU_PIPELINE_SEND, MPIDI_OFI_REQUEST(rreq, - pipeline_info. 
- mask_bits), - (void *) &chunk_req->context); - } - if (MPIDI_OFI_global.gpu_recv_queue || !host_buf || ret != 0) { - MPIDI_OFI_gpu_pending_recv_t *recv_task = - MPIDI_OFI_create_recv_task(chunk_req, i, n_chunks); - DL_APPEND(MPIDI_OFI_global.gpu_recv_queue, recv_task); - } + mpi_errno = MPIDI_OFI_gpu_pipeline_recv(rreq, i, n_chunks); + MPIR_ERR_CHECK(mpi_errno); } } } else { diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index c61d9f2deef..5b79bdd3394 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -20,6 +20,8 @@ struct send_alloc { }; static int send_alloc_poll(MPIR_Async_thing * thing); +static void spawn_send_copy(MPIR_Async_thing * thing, MPIR_Request * sreq, MPIR_async_req * areq, + const void *buf, MPI_Aint chunk_sz); int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, @@ -80,7 +82,7 @@ static int send_alloc_poll(MPIR_Async_thing * thing) * to prevent parent request being freed prematurally. 
*/ MPIR_cc_inc(p->sreq->cc_ptr); - spawn_pipeline_send(thing, p->sreq, &async_req, host_buf, chunk_sz); + spawn_send_copy(thing, p->sreq, &async_req, host_buf, chunk_sz); num_new_chunks++; } @@ -168,6 +170,87 @@ static void send_copy_complete(MPIR_Request * sreq, const void *buf, MPI_Aint ch MPIR_Assert(0); } +/* ------------------------------------ + * recv_alloc: allocate recv chunk buffer and post fi_trecv + */ +struct recv_alloc { + MPIR_Request *rreq; + MPIDI_OFI_gpu_pipeline_request *chunk_req; + int idx; + int n_chunks; +}; + +static int recv_alloc_poll(MPIR_Async_thing * thing); + +int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, int idx, int n_chunks) +{ + int mpi_errno = MPI_SUCCESS; + + struct recv_alloc *p; + p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER); + MPIR_Assert(p); + + p->rreq = rreq; + p->idx = idx; + p->n_chunks = n_chunks; + + mpi_errno = MPIR_Async_things_add(recv_alloc_poll, p); + + return mpi_errno; +} + +static int recv_alloc_poll(MPIR_Async_thing * thing) +{ + struct recv_alloc *p = MPIR_Async_thing_get_state(thing); + MPIR_Request *rreq = p->rreq; + + if (MPIR_cc_get(rreq->cc) > 1) { + return MPIR_ASYNC_THING_NOPROGRESS; + } + + void *host_buf; + MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, &host_buf); + if (!host_buf) { + return MPIR_ASYNC_THING_NOPROGRESS; + } + + fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr); + int ctx_idx = MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx); + int vci = MPIDI_Request_get_vci(rreq); + uint64_t match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.match_bits); + uint64_t mask_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits); + + MPIDI_OFI_gpu_pipeline_request *chunk_req; + chunk_req = MPL_malloc(sizeof(*chunk_req), MPL_MEM_BUFFER); + MPIR_Assert(chunk_req); + + chunk_req->parent = rreq; + chunk_req->buf = host_buf; + if (p->n_chunks == -1) { + chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT; + } else { + match_bits |= 
MPIDI_OFI_GPU_PIPELINE_SEND; + chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; + } + MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); + int ret = fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx, + host_buf, MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, NULL, remote_addr, + match_bits, mask_bits, (void *) &chunk_req->context); + MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); + if (ret == 0) { + MPL_free(p); + /* chunk_req and host_buf will be freed in recv_events */ + return MPIR_ASYNC_THING_DONE; + } + if (ret != -FI_EAGAIN && ret != -FI_ENOMEM) { + /* unexpected error */ + MPIR_Assert(0); + } + MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, host_buf); + MPL_free(chunk_req); + return MPIR_ASYNC_THING_NOPROGRESS; +}; + /* ------------------------------------ * recv_copy: async copy from host_buf to user buffer in recv event */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 40073c4fdde..66478af9abe 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -830,6 +830,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_free_pack_buffer(void *ptr) int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, MPL_pointer_attr_t attr, MPI_Aint data_sz); +int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, int idx, int n_chunks); int MPIDI_OFI_gpu_pipeline_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, void *recv_buf, MPI_Aint count, MPI_Datatype datatype); @@ -842,96 +843,4 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_pipeline_chunk_size(size_t data_sz) return chunk_size; } -MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_task_t *MPIDI_OFI_create_gpu_task(MPIDI_OFI_pipeline_type_t - type, void *buf, - size_t len, - MPIR_Request * request, - MPIR_async_req async_req) -{ - MPIDI_OFI_gpu_task_t *task = - (MPIDI_OFI_gpu_task_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_task_t), MPL_MEM_OTHER); - MPIR_Assert(task != NULL); - 
task->type = type; - task->status = MPIDI_OFI_PIPELINE_READY; - task->buf = buf; - task->len = len; - task->request = request; - task->areq = async_req; - task->prev = NULL; - task->next = NULL; - return task; -} - -MPL_STATIC_INLINE_PREFIX MPIDI_OFI_gpu_pending_recv_t - * MPIDI_OFI_create_recv_task(MPIDI_OFI_gpu_pipeline_request * req, int idx, int n_chunks) -{ - MPIDI_OFI_gpu_pending_recv_t *task = - (MPIDI_OFI_gpu_pending_recv_t *) MPL_malloc(sizeof(MPIDI_OFI_gpu_pending_recv_t), - MPL_MEM_OTHER); - MPIR_Assert(task); - task->req = req; - task->idx = idx; - task->n_chunks = n_chunks; - task->prev = NULL; - task->next = NULL; - return task; -} - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress_recv(void) -{ - int mpi_errno = MPI_SUCCESS; - - while (MPIDI_OFI_global.gpu_recv_queue) { - MPIDI_OFI_gpu_pending_recv_t *recv_task = MPIDI_OFI_global.gpu_recv_queue; - MPIDI_OFI_gpu_pipeline_request *chunk_req = recv_task->req; - MPIR_Request *rreq = chunk_req->parent; - void *host_buf = chunk_req->buf; - if (!host_buf) { - MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - (void **) &host_buf); - if (!host_buf) { - break; - } - chunk_req->buf = host_buf; - } - fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr); - - int ret = fi_trecv(MPIDI_OFI_global.ctx[MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx)].rx, - (void *) host_buf, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, NULL, remote_addr, - MPIDI_OFI_REQUEST(rreq, - pipeline_info.match_bits) | - MPIDI_OFI_GPU_PIPELINE_SEND, - MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits), - (void *) &chunk_req->context); - if (ret == 0) { - DL_DELETE(MPIDI_OFI_global.gpu_recv_queue, recv_task); - MPL_free(recv_task); - } else if (ret == -FI_EAGAIN || ret == -FI_ENOMEM) { - break; - } else { - goto fn_fail; - } - } - - fn_exit: - return mpi_errno; - fn_fail: - mpi_errno = MPI_ERR_OTHER; - goto fn_exit; -} - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_progress(int vni) -{ - int 
mpi_errno = MPI_SUCCESS; - - mpi_errno = MPIDI_OFI_gpu_progress_recv(); - MPIR_ERR_CHECK(mpi_errno); - - fn_exit: - return mpi_errno; - fn_fail: - goto fn_exit; -} - #endif /* OFI_IMPL_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_init.c b/src/mpid/ch4/netmod/ofi/ofi_init.c index 8bf54ef73e2..59131aea264 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_init.c +++ b/src/mpid/ch4/netmod/ofi/ofi_init.c @@ -734,7 +734,6 @@ int MPIDI_OFI_init_local(int *tag_bits) host_free_registered, &MPIDI_OFI_global.gpu_pipeline_recv_pool); MPIR_ERR_CHECK(mpi_errno); - MPIDI_OFI_global.gpu_recv_queue = NULL; } /* Initialize RMA keys allocator */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_progress.h b/src/mpid/ch4/netmod/ofi/ofi_progress.h index bf87b13ca88..7a24e0f8562 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_progress.h +++ b/src/mpid/ch4/netmod/ofi/ofi_progress.h @@ -82,12 +82,9 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_progress(int vci, int *made_progress) * to do, so simply return. * NOTE: it is not an error since global progress will poll every vci. 
*/ - return MPI_SUCCESS; + goto fn_exit; } - mpi_errno = MPIDI_OFI_gpu_progress(vci); - MPIR_ERR_CHECK(mpi_errno); - if (unlikely(MPIDI_OFI_has_cq_buffered(vci))) { int num = MPIDI_OFI_get_buffered(vci, wc); mpi_errno = MPIDI_OFI_handle_cq_entries(vci, wc, num); @@ -113,8 +110,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_progress(int vci, int *made_progress) fn_exit: MPIR_FUNC_EXIT; return mpi_errno; - fn_fail: - goto fn_exit; } #endif /* OFI_PROGRESS_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_recv.h b/src/mpid/ch4/netmod/ofi/ofi_recv.h index fd00fc42fbb..9beebd7f579 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_recv.h +++ b/src/mpid/ch4/netmod/ofi/ofi_recv.h @@ -222,14 +222,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, data_sz >= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD) { /* Pipeline path */ MPL_atomic_relaxed_store_int(&MPIDI_OFI_REQUEST(rreq, util_id), context_id); - MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT; - /* Only post first recv with pipeline chunk size. 
*/ - char *host_buf = NULL; - MPIDU_genq_private_pool_force_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, - (void **) &host_buf); - MPIR_ERR_CHKANDJUMP1(host_buf == NULL, mpi_errno, - MPI_ERR_OTHER, "**nomem", "**nomem %s", - "Pipeline Init recv alloc"); fi_addr_t remote_addr; if (MPI_ANY_SOURCE == rank) @@ -252,7 +244,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx) = ctx_idx; /* Save original buf, datatype and count */ - MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) = host_buf; MPIDI_OFI_REQUEST(rreq, noncontig.pack.buf) = buf; MPIDI_OFI_REQUEST(rreq, noncontig.pack.count) = count; MPIDI_OFI_REQUEST(rreq, noncontig.pack.datatype) = datatype; @@ -262,24 +253,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, MPIR_Comm_add_ref(comm); } - MPIDI_OFI_gpu_pipeline_request *chunk_req; - chunk_req = (MPIDI_OFI_gpu_pipeline_request *) - MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER); - MPIR_ERR_CHKANDJUMP1(chunk_req == NULL, mpi_errno, - MPI_ERR_OTHER, "**nomem", "**nomem %s", "Recv chunk_req alloc"); - chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT; - chunk_req->parent = rreq; - chunk_req->buf = host_buf; - int ret = 0; - ret = fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx, - host_buf, - MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, - NULL, remote_addr, match_bits, mask_bits, (void *) &chunk_req->context); - if (MPIDI_OFI_global.gpu_recv_queue || !host_buf || ret != 0) { - MPIDI_OFI_gpu_pending_recv_t *recv_task = - MPIDI_OFI_create_recv_task(chunk_req, 0, -1); - DL_APPEND(MPIDI_OFI_global.gpu_recv_queue, recv_task); - } + mpi_errno = MPIDI_OFI_gpu_pipeline_recv(rreq, 0, -1); goto fn_exit; } @@ -475,7 +449,7 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_NM_mpi_cancel_recv(MPIR_Request * rreq, bool fi_cancel((fid_t) MPIDI_OFI_global.ctx[ctx_idx].rx, &(MPIDI_OFI_REQUEST(rreq, context))); if (is_blocking) { - /* progress until the rreq complets, either with cancel-bit 
set, + /* progress until the rreq completes, either with cancel-bit set, * or with message received */ while (!MPIR_cc_is_complete(&rreq->cc)) { mpi_errno = MPIDI_OFI_progress_uninlined(vci); diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index f9ebe61bfcc..0466c613e7f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -491,7 +491,6 @@ typedef struct { /* GPU pipeline */ MPIDU_genq_private_pool_t gpu_pipeline_send_pool; MPIDU_genq_private_pool_t gpu_pipeline_recv_pool; - MPIDI_OFI_gpu_pending_recv_t *gpu_recv_queue; /* Process management and PMI globals */ int pname_set; From 1756c227fbd4c2b5c54f4fb69bd0d4505c94ec14 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 5 Feb 2024 10:43:37 -0600 Subject: [PATCH 09/17] ch4/ofi: move gpu pipeline events into ofi_gpu_pipeline.c Consolidate the gpu pipeline code. MPIDI_OFI_gpu_pipeline_request is now an internal struct in ofi_gpu_pipeline.c, rename to struct chunk_req. MPIDI_OFI_gpu_pipeline_recv_copy is now an internal function, rename to start_recv_copy. 
--- src/mpid/ch4/netmod/ofi/ofi_events.c | 102 +------------------ src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 110 +++++++++++++++++++-- src/mpid/ch4/netmod/ofi/ofi_impl.h | 4 +- src/mpid/ch4/netmod/ofi/ofi_types.h | 37 ------- 4 files changed, 107 insertions(+), 146 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.c b/src/mpid/ch4/netmod/ofi/ofi_events.c index 85042ce27a3..4f31c755ed3 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.c +++ b/src/mpid/ch4/netmod/ofi/ofi_events.c @@ -80,102 +80,6 @@ static int peek_empty_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request return MPI_SUCCESS; } -/* GPU pipeline events */ -static int pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) -{ - int mpi_errno = MPI_SUCCESS; - int c; - MPIDI_OFI_gpu_pipeline_request *req; - MPIR_Request *sreq; - void *wc_buf = NULL; - MPIR_FUNC_ENTER; - - req = (MPIDI_OFI_gpu_pipeline_request *) r; - /* get original mpi request */ - sreq = req->parent; - wc_buf = req->buf; - MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, wc_buf); - - MPIR_cc_decr(sreq->cc_ptr, &c); - if (c == 0) { - MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(sreq, datatype)); - MPIR_Request_free(sreq); - } - MPL_free(r); - - MPIR_FUNC_EXIT; - return mpi_errno; -} - -static int pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r, int event_id) -{ - int mpi_errno = MPI_SUCCESS; - int i; - MPIDI_OFI_gpu_pipeline_request *req; - MPIR_Request *rreq; - void *wc_buf = NULL; - int in_use MPL_UNUSED; - - MPIR_FUNC_ENTER; - - req = (MPIDI_OFI_gpu_pipeline_request *) r; - rreq = req->parent; - wc_buf = req->buf; - MPL_free(r); - - void *recv_buf = MPIDI_OFI_REQUEST(rreq, noncontig.pack.buf); - size_t recv_count = MPIDI_OFI_REQUEST(rreq, noncontig.pack.count); - MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, noncontig.pack.datatype); - - if (event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT) { - rreq->status.MPI_SOURCE = 
MPIDI_OFI_cqe_get_source(wc, true); - rreq->status.MPI_ERROR = MPIDI_OFI_idata_get_error_bits(wc->data); - rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); - - if (unlikely(MPIDI_OFI_is_tag_sync(wc->tag))) { - MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) = true; - } - - uint32_t packed = MPIDI_OFI_idata_get_gpu_packed_bit(wc->data); - uint32_t n_chunks = MPIDI_OFI_idata_get_gpuchunk_bits(wc->data); - if (likely(packed == 0)) { - if (wc->len > 0) { - MPIR_Assert(n_chunks == 0); - /* First chunk arrives. */ - mpi_errno = MPIDI_OFI_gpu_pipeline_recv_copy(rreq, wc_buf, wc->len, - recv_buf, recv_count, datatype); - MPIR_ERR_CHECK(mpi_errno); - } else { - /* free this chunk */ - MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, wc_buf); - MPIR_Assert(n_chunks > 0); - /* Post recv for remaining chunks. */ - MPIR_cc_dec(rreq->cc_ptr); - for (i = 0; i < n_chunks; i++) { - mpi_errno = MPIDI_OFI_gpu_pipeline_recv(rreq, i, n_chunks); - MPIR_ERR_CHECK(mpi_errno); - } - } - } else { - MPIR_ERR_CHKANDJUMP(true, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); - } - } else { - if (likely(event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE)) { - mpi_errno = MPIDI_OFI_gpu_pipeline_recv_copy(rreq, wc_buf, wc->len, - recv_buf, recv_count, datatype); - MPIR_ERR_CHECK(mpi_errno); - } else { - MPIR_ERR_CHKANDJUMP(true, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); - } - } - fn_exit: - MPIR_FUNC_EXIT; - return mpi_errno; - fn_fail: - rreq->status.MPI_ERROR = mpi_errno; - goto fn_exit; -} - static int send_huge_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * sreq) { int mpi_errno = MPI_SUCCESS; @@ -567,13 +471,13 @@ int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Req mpi_errno = am_read_event(vci, wc, req); goto fn_exit; } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_SEND_GPU_PIPELINE) { - mpi_errno = pipeline_send_event(wc, req); + mpi_errno = MPIDI_OFI_gpu_pipeline_send_event(wc, req); goto 
fn_exit; } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT) { - mpi_errno = pipeline_recv_event(wc, req, MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT); + mpi_errno = MPIDI_OFI_gpu_pipeline_recv_event(wc, req); goto fn_exit; } else if (MPIDI_OFI_REQUEST(req, event_id) == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE) { - mpi_errno = pipeline_recv_event(wc, req, MPIDI_OFI_EVENT_RECV_GPU_PIPELINE); + mpi_errno = MPIDI_OFI_gpu_pipeline_recv_event(wc, req); goto fn_exit; } else if (unlikely(1)) { switch (MPIDI_OFI_REQUEST(req, event_id)) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index 5b79bdd3394..ae0f14dcc1e 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -6,6 +6,19 @@ #include "mpidimpl.h" #include "mpir_async_things.h" +struct chunk_req { + char pad[MPIDI_REQUEST_HDR_SIZE]; + struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; /* fixed field, do not move */ + int event_id; /* fixed field, do not move */ + MPIR_Request *parent; /* Parent request */ + void *buf; +}; + +static void spawn_send_copy(MPIR_Async_thing * thing, MPIR_Request * sreq, MPIR_async_req * areq, + const void *buf, MPI_Aint chunk_sz); +static int start_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, + void *recv_buf, MPI_Aint count, MPI_Datatype datatype); + /* ------------------------------------ * send_alloc: allocate send chunks and start copy, may postpone as async */ @@ -20,8 +33,6 @@ struct send_alloc { }; static int send_alloc_poll(MPIR_Async_thing * thing); -static void spawn_send_copy(MPIR_Async_thing * thing, MPIR_Request * sreq, MPIR_async_req * areq, - const void *buf, MPI_Aint chunk_sz); int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, @@ -144,8 +155,7 @@ static void send_copy_complete(MPIR_Request * sreq, const void *buf, MPI_Aint ch int mpi_errno = MPI_SUCCESS; int 
vci_local = MPIDI_OFI_REQUEST(sreq, pipeline_info.vci_local); - MPIDI_OFI_gpu_pipeline_request *chunk_req = (MPIDI_OFI_gpu_pipeline_request *) - MPL_malloc(sizeof(MPIDI_OFI_gpu_pipeline_request), MPL_MEM_BUFFER); + struct chunk_req *chunk_req = MPL_malloc(sizeof(struct chunk_req), MPL_MEM_BUFFER); MPIR_Assertp(chunk_req); chunk_req->parent = sreq; @@ -170,12 +180,36 @@ static void send_copy_complete(MPIR_Request * sreq, const void *buf, MPI_Aint ch MPIR_Assert(0); } +/* ------------------------------------ + * send_event: callback for MPIDI_OFI_dispatch_function in ofi_events.c + */ +int MPIDI_OFI_gpu_pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + + struct chunk_req *chunk_req = (void *) r; + MPIR_Request *sreq = chunk_req->parent;; + void *host_buf = chunk_req->buf; + MPL_free(chunk_req); + + MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, host_buf); + + int c; + MPIR_cc_decr(sreq->cc_ptr, &c); + if (c == 0) { + MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(sreq, datatype)); + MPIR_Request_free(sreq); + } + + return mpi_errno; +} + /* ------------------------------------ * recv_alloc: allocate recv chunk buffer and post fi_trecv */ struct recv_alloc { MPIR_Request *rreq; - MPIDI_OFI_gpu_pipeline_request *chunk_req; + struct chunk_req *chunk_req; int idx; int n_chunks; }; @@ -220,7 +254,7 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) uint64_t match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.match_bits); uint64_t mask_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits); - MPIDI_OFI_gpu_pipeline_request *chunk_req; + struct chunk_req *chunk_req; chunk_req = MPL_malloc(sizeof(*chunk_req), MPL_MEM_BUFFER); MPIR_Assert(chunk_req); @@ -251,6 +285,66 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) return MPIR_ASYNC_THING_NOPROGRESS; }; +/* ------------------------------------ + * recv_event: callback for MPIDI_OFI_dispatch_function in ofi_events.c + */ 
+int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r) +{ + int mpi_errno = MPI_SUCCESS; + + struct chunk_req *chunk_req = (void *) r; + int event_id = chunk_req->event_id; + MPIR_Request *rreq = chunk_req->parent; + void *host_buf = chunk_req->buf; + + MPL_free(chunk_req); + + void *recv_buf = MPIDI_OFI_REQUEST(rreq, noncontig.pack.buf); + size_t recv_count = MPIDI_OFI_REQUEST(rreq, noncontig.pack.count); + MPI_Datatype datatype = MPIDI_OFI_REQUEST(rreq, noncontig.pack.datatype); + + if (event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT) { + rreq->status.MPI_SOURCE = MPIDI_OFI_cqe_get_source(wc, true); + rreq->status.MPI_ERROR = MPIDI_OFI_idata_get_error_bits(wc->data); + rreq->status.MPI_TAG = MPIDI_OFI_init_get_tag(wc->tag); + + if (unlikely(MPIDI_OFI_is_tag_sync(wc->tag))) { + MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) = true; + } + + uint32_t packed = MPIDI_OFI_idata_get_gpu_packed_bit(wc->data); + uint32_t n_chunks = MPIDI_OFI_idata_get_gpuchunk_bits(wc->data); + /* ? - Not sure why sender cannot send packed data */ + MPIR_Assertp(packed == 0); + if (wc->len > 0) { + /* message from a normal send */ + MPIR_Assert(n_chunks == 0); + mpi_errno = start_recv_copy(rreq, host_buf, wc->len, recv_buf, recv_count, datatype); + MPIR_ERR_CHECK(mpi_errno); + } else { + MPIR_Assert(n_chunks > 0); + /* There is no data in the init chunk, free the buffer */ + MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, host_buf); + MPIR_cc_dec(rreq->cc_ptr); + /* Post recv for the remaining chunks. 
*/ + for (int i = 0; i < n_chunks; i++) { + mpi_errno = MPIDI_OFI_gpu_pipeline_recv(rreq, i, n_chunks); + MPIR_ERR_CHECK(mpi_errno); + } + } + } else { + MPIR_Assert(event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE); + mpi_errno = start_recv_copy(rreq, host_buf, wc->len, recv_buf, recv_count, datatype); + MPIR_ERR_CHECK(mpi_errno); + } + + fn_exit: + return mpi_errno; + fn_fail: + rreq->status.MPI_ERROR = mpi_errno; + goto fn_exit; +} + /* ------------------------------------ * recv_copy: async copy from host_buf to user buffer in recv event */ @@ -265,8 +359,8 @@ struct recv_copy { static int recv_copy_poll(MPIR_Async_thing * thing); static void recv_copy_complete(MPIR_Request * rreq, void *buf); -int MPIDI_OFI_gpu_pipeline_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, - void *recv_buf, MPI_Aint count, MPI_Datatype datatype) +static int start_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, + void *recv_buf, MPI_Aint count, MPI_Datatype datatype) { int mpi_errno = MPI_SUCCESS; diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 66478af9abe..50280106362 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -831,8 +831,8 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, MPL_pointer_attr_t attr, MPI_Aint data_sz); int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, int idx, int n_chunks); -int MPIDI_OFI_gpu_pipeline_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, - void *recv_buf, MPI_Aint count, MPI_Datatype datatype); +int MPIDI_OFI_gpu_pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); +int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_pipeline_chunk_size(size_t data_sz) { diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index 
0466c613e7f..46f49ea4478 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -337,43 +337,6 @@ typedef struct { struct fid_cq *cq; } MPIDI_OFI_context_t; -/* GPU pipelining */ -typedef struct { - char pad[MPIDI_REQUEST_HDR_SIZE]; - struct fi_context context[MPIDI_OFI_CONTEXT_STRUCTS]; /* fixed field, do not move */ - int event_id; /* fixed field, do not move */ - MPIR_Request *parent; /* Parent request */ - void *buf; -} MPIDI_OFI_gpu_pipeline_request; - -typedef struct MPIDI_OFI_gpu_task { - MPIDI_OFI_pipeline_type_t type; - MPIDI_OFI_pipeline_status_t status; - void *buf; - size_t len; - MPIR_Request *request; - MPIR_async_req async_req; - struct MPIDI_OFI_gpu_task *next, *prev; -} MPIDI_OFI_gpu_task_t; - -typedef struct MPIDI_OFI_gpu_pending_recv { - MPIDI_OFI_gpu_pipeline_request *req; - int idx; - uint32_t n_chunks; - struct MPIDI_OFI_gpu_pending_recv *next, *prev; -} MPIDI_OFI_gpu_pending_recv_t; - -typedef struct MPIDI_OFI_gpu_pending_send { - MPIR_Request *sreq; - void *send_buf; - MPL_pointer_attr_t attr; - MPI_Aint offset; - uint32_t n_chunks; - MPI_Aint left_sz, count; - int dt_contig; - struct MPIDI_OFI_gpu_pending_send *next, *prev; -} MPIDI_OFI_gpu_pending_send_t; - typedef union { MPID_Thread_mutex_t m; char cacheline[MPL_CACHELINE_SIZE]; From 4baf414745508b3b397670d12f4e65751788e0cb Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 5 Feb 2024 11:41:51 -0600 Subject: [PATCH 10/17] ch4/ofi: move all gpu pipeline code into ofi_gpu_pipeline.c Move all gpu pipeline specific code into ofi_gpu_pipeline.c. Make a new function MPIDI_OFI_gpu_pipeline_recv that fills rreq with persistent pipeline_info data. Rename the original MPIDI_OFI_gpu_pipeline_recv into static function start_recv_chunk. 
--- src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 81 ++++++++++++++++++++-- src/mpid/ch4/netmod/ofi/ofi_impl.h | 19 +++-- src/mpid/ch4/netmod/ofi/ofi_recv.h | 19 +---- src/mpid/ch4/netmod/ofi/ofi_send.h | 40 ++--------- 4 files changed, 92 insertions(+), 67 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index ae0f14dcc1e..dd8a66eaa0a 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -16,6 +16,7 @@ struct chunk_req { static void spawn_send_copy(MPIR_Async_thing * thing, MPIR_Request * sreq, MPIR_async_req * areq, const void *buf, MPI_Aint chunk_sz); +static int start_recv_chunk(MPIR_Request * rreq, int idx, int n_chunks); static int start_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, void *recv_buf, MPI_Aint count, MPI_Datatype datatype); @@ -28,7 +29,7 @@ struct send_alloc { MPI_Aint count; MPI_Datatype datatype; MPL_pointer_attr_t attr; - MPI_Aint offset, left_sz; + MPI_Aint offset, left_sz, chunk_sz; int n_chunks; }; @@ -36,10 +37,39 @@ static int send_alloc_poll(MPIR_Async_thing * thing); int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, - MPL_pointer_attr_t attr, MPI_Aint data_sz) + MPL_pointer_attr_t attr, MPI_Aint data_sz, + uint64_t cq_data, fi_addr_t remote_addr, + int vci_local, int ctx_idx, uint64_t match_bits) { int mpi_errno = MPI_SUCCESS; + uint32_t n_chunks = 0; + uint64_t is_packed = 0; /* always 0 ? 
*/ + MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; + if (data_sz <= chunk_sz) { + /* data fits in a single chunk */ + chunk_sz = data_sz; + n_chunks = 1; + } else { + n_chunks = data_sz / chunk_sz; + if (data_sz % chunk_sz > 0) { + n_chunks++; + } + } + MPIDI_OFI_idata_set_gpuchunk_bits(&cq_data, n_chunks); + MPIDI_OFI_idata_set_gpu_packed_bit(&cq_data, is_packed); + + MPIDI_OFI_REQUEST(sreq, pipeline_info.cq_data) = cq_data; + MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr) = remote_addr; + MPIDI_OFI_REQUEST(sreq, pipeline_info.vci_local) = vci_local; + MPIDI_OFI_REQUEST(sreq, pipeline_info.ctx_idx) = ctx_idx; + MPIDI_OFI_REQUEST(sreq, pipeline_info.match_bits) = match_bits; + MPIDI_OFI_REQUEST(sreq, pipeline_info.data_sz) = data_sz; + + /* Send the initial empty packet for matching */ + MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, NULL, 0, cq_data, + remote_addr, match_bits), vci_local, tinjectdata); + struct send_alloc *p; p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER); MPIR_Assert(p); @@ -50,12 +80,17 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, p->datatype = datatype; p->attr = attr; p->left_sz = data_sz; + p->chunk_sz = chunk_sz; p->offset = 0; p->n_chunks = 0; mpi_errno = MPIR_Async_things_add(send_alloc_poll, p); + /* TODO: kick the progress right away */ + fn_exit: return mpi_errno; + fn_fail: + goto fn_exit; } static int send_alloc_poll(MPIR_Async_thing * thing) @@ -70,7 +105,7 @@ static int send_alloc_poll(MPIR_Async_thing * thing) return (num_new_chunks == 0) ? MPIR_ASYNC_THING_NOPROGRESS : MPIR_ASYNC_THING_UPDATED; } MPIR_async_req async_req; - MPI_Aint chunk_sz = MPL_MIN(p->left_sz, MPIDI_OFI_REQUEST(p->sreq, pipeline_info.chunk_sz)); + MPI_Aint chunk_sz = MPL_MIN(p->left_sz, p->chunk_sz); MPL_gpu_engine_type_t engine_type = MPIDI_OFI_gpu_get_send_engine_type(MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE); int commit = p->left_sz <= chunk_sz ? 
1 : 0; @@ -216,7 +251,43 @@ struct recv_alloc { static int recv_alloc_poll(MPIR_Async_thing * thing); -int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, int idx, int n_chunks) +int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, + void *recv_buf, MPI_Aint count, MPI_Datatype datatype, + fi_addr_t remote_addr, int vci_local, + uint64_t match_bits, uint64_t mask_bits, + MPI_Aint data_sz, int ctx_idx) +{ + int mpi_errno = MPI_SUCCESS; + + MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) = 0; + MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) = false; + MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr) = remote_addr; + MPIDI_OFI_REQUEST(rreq, pipeline_info.vci_local) = vci_local; + MPIDI_OFI_REQUEST(rreq, pipeline_info.match_bits) = match_bits; + MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits) = mask_bits; + MPIDI_OFI_REQUEST(rreq, pipeline_info.data_sz) = data_sz; + MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx) = ctx_idx; + + /* Save original buf, datatype and count */ + MPIDI_OFI_REQUEST(rreq, noncontig.pack.buf) = recv_buf; + MPIDI_OFI_REQUEST(rreq, noncontig.pack.count) = count; + MPIDI_OFI_REQUEST(rreq, noncontig.pack.datatype) = datatype; + + struct recv_alloc *p; + p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER); + MPIR_Assert(p); + + p->rreq = rreq; + p->idx = 0; + p->n_chunks = -1; /* it's MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT */ + + mpi_errno = MPIR_Async_things_add(recv_alloc_poll, p); + + return mpi_errno; +} + +/* this is called from recv_event */ +static int start_recv_chunk(MPIR_Request * rreq, int idx, int n_chunks) { int mpi_errno = MPI_SUCCESS; @@ -328,7 +399,7 @@ int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Reques MPIR_cc_dec(rreq->cc_ptr); /* Post recv for the remaining chunks. 
*/ for (int i = 0; i < n_chunks; i++) { - mpi_errno = MPIDI_OFI_gpu_pipeline_recv(rreq, i, n_chunks); + mpi_errno = start_recv_chunk(rreq, i, n_chunks); MPIR_ERR_CHECK(mpi_errno); } } diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 50280106362..b4c959b8f35 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -829,18 +829,15 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_free_pack_buffer(void *ptr) int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, - MPL_pointer_attr_t attr, MPI_Aint data_sz); -int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, int idx, int n_chunks); + MPL_pointer_attr_t attr, MPI_Aint data_sz, + uint64_t cq_data, fi_addr_t remote_addr, + int vci_local, int ctx_idx, uint64_t match_bits); +int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, + void *recv_buf, MPI_Aint count, MPI_Datatype datatype, + fi_addr_t remote_addr, int vci_local, + uint64_t match_bits, uint64_t mask_bits, + MPI_Aint data_sz, int ctx_idx); int MPIDI_OFI_gpu_pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Request * r); -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_pipeline_chunk_size(size_t data_sz) -{ - int chunk_size = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; - if (data_sz <= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ) { - chunk_size = data_sz; - } - return chunk_size; -} - #endif /* OFI_IMPL_H_INCLUDED */ diff --git a/src/mpid/ch4/netmod/ofi/ofi_recv.h b/src/mpid/ch4/netmod/ofi/ofi_recv.h index 9beebd7f579..74ed3323bc3 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_recv.h +++ b/src/mpid/ch4/netmod/ofi/ofi_recv.h @@ -233,27 +233,14 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf, remote_addr = MPIDI_OFI_av_to_phys(addr, sender_nic, vci_remote); } - /* Save pipeline information. 
*/ - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) = 0; - MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) = false; - MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr) = remote_addr; - MPIDI_OFI_REQUEST(rreq, pipeline_info.vci_local) = vci_local; - MPIDI_OFI_REQUEST(rreq, pipeline_info.match_bits) = match_bits; - MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits) = mask_bits; - MPIDI_OFI_REQUEST(rreq, pipeline_info.data_sz) = data_sz; - MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx) = ctx_idx; - - /* Save original buf, datatype and count */ - MPIDI_OFI_REQUEST(rreq, noncontig.pack.buf) = buf; - MPIDI_OFI_REQUEST(rreq, noncontig.pack.count) = count; - MPIDI_OFI_REQUEST(rreq, noncontig.pack.datatype) = datatype; - if (rreq->comm == NULL) { rreq->comm = comm; MPIR_Comm_add_ref(comm); } - mpi_errno = MPIDI_OFI_gpu_pipeline_recv(rreq, 0, -1); + mpi_errno = MPIDI_OFI_gpu_pipeline_recv(rreq, buf, count, datatype, + remote_addr, vci_local, + match_bits, mask_bits, data_sz, ctx_idx); goto fn_exit; } diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index 9d72d3f5f0d..855970e8927 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -274,43 +274,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou if (force_gpu_pack && MPIR_CVAR_CH4_OFI_ENABLE_GPU_PIPELINE && data_sz >= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD) { /* Pipeline path */ - uint32_t n_chunks = 0; - int chunk_size = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; - if (dt_contig) { - /* Update correct number of chunks in immediate data. 
*/ - chunk_size = MPIDI_OFI_gpu_pipeline_chunk_size(data_sz); - n_chunks = data_sz / chunk_size; - if (data_sz % chunk_size) - n_chunks++; - MPIDI_OFI_idata_set_gpuchunk_bits(&cq_data, n_chunks); - } + fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(addr, receiver_nic, vci_remote); + mpi_errno = MPIDI_OFI_gpu_pipeline_send(sreq, buf, count, datatype, attr, data_sz, + cq_data, remote_addr, vci_local, ctx_idx, + match_bits); + MPIR_ERR_CHECK(mpi_errno); - /* Update sender packed bit if necessary. */ - uint64_t is_packed = datatype == MPI_PACKED ? 1 : 0; - MPIDI_OFI_idata_set_gpu_packed_bit(&cq_data, is_packed); - MPIR_ERR_CHKANDJUMP(is_packed, mpi_errno, MPI_ERR_OTHER, "**gpu_pipeline_packed"); - - /* Save pipeline information. */ - MPIDI_OFI_REQUEST(sreq, pipeline_info.chunk_sz) = chunk_size; - MPIDI_OFI_REQUEST(sreq, pipeline_info.cq_data) = cq_data; - MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr) = - MPIDI_OFI_av_to_phys(addr, receiver_nic, vci_remote); - MPIDI_OFI_REQUEST(sreq, pipeline_info.vci_local) = vci_local; - MPIDI_OFI_REQUEST(sreq, pipeline_info.ctx_idx) = ctx_idx; - MPIDI_OFI_REQUEST(sreq, pipeline_info.match_bits) = match_bits; - MPIDI_OFI_REQUEST(sreq, pipeline_info.data_sz) = data_sz; - - /* send an empty message for tag matching */ - MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, - NULL, - 0, - cq_data, - MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr), - match_bits), vci_local, tinjectdata); MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], data_sz); - - MPIDI_OFI_gpu_pipeline_send(sreq, buf, count, datatype, attr, data_sz); - goto fn_exit; } From 4ed79096fc5e3e44cff3c918c4200b65e3bc8f9d Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 5 Feb 2024 12:37:03 -0600 Subject: [PATCH 11/17] ch4/ofi: refactor pipeline_info into a union Make the code cleaner to separate the pipeline_info type into a union of send and recv. 
--- src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 60 ++++++++++------------ src/mpid/ch4/netmod/ofi/ofi_pre.h | 31 ++++++----- 2 files changed, 45 insertions(+), 46 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index dd8a66eaa0a..707af00b33e 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -59,12 +59,11 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPIDI_OFI_idata_set_gpuchunk_bits(&cq_data, n_chunks); MPIDI_OFI_idata_set_gpu_packed_bit(&cq_data, is_packed); - MPIDI_OFI_REQUEST(sreq, pipeline_info.cq_data) = cq_data; - MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr) = remote_addr; - MPIDI_OFI_REQUEST(sreq, pipeline_info.vci_local) = vci_local; - MPIDI_OFI_REQUEST(sreq, pipeline_info.ctx_idx) = ctx_idx; - MPIDI_OFI_REQUEST(sreq, pipeline_info.match_bits) = match_bits; - MPIDI_OFI_REQUEST(sreq, pipeline_info.data_sz) = data_sz; + MPIDI_OFI_REQUEST(sreq, pipeline_info.send.cq_data) = cq_data; + MPIDI_OFI_REQUEST(sreq, pipeline_info.send.remote_addr) = remote_addr; + MPIDI_OFI_REQUEST(sreq, pipeline_info.send.vci_local) = vci_local; + MPIDI_OFI_REQUEST(sreq, pipeline_info.send.ctx_idx) = ctx_idx; + MPIDI_OFI_REQUEST(sreq, pipeline_info.send.match_bits) = match_bits; /* Send the initial empty packet for matching */ MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, NULL, 0, cq_data, @@ -188,7 +187,7 @@ static int send_copy_poll(MPIR_Async_thing * thing) static void send_copy_complete(MPIR_Request * sreq, const void *buf, MPI_Aint chunk_sz) { int mpi_errno = MPI_SUCCESS; - int vci_local = MPIDI_OFI_REQUEST(sreq, pipeline_info.vci_local); + int vci_local = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.vci_local); struct chunk_req *chunk_req = MPL_malloc(sizeof(struct chunk_req), MPL_MEM_BUFFER); MPIR_Assertp(chunk_req); @@ -197,11 +196,11 @@ static void send_copy_complete(MPIR_Request * sreq, const 
void *buf, MPI_Aint ch chunk_req->event_id = MPIDI_OFI_EVENT_SEND_GPU_PIPELINE; chunk_req->buf = (void *) buf; - int ctx_idx = MPIDI_OFI_REQUEST(sreq, pipeline_info.ctx_idx); - fi_addr_t remote_addr = MPIDI_OFI_REQUEST(sreq, pipeline_info.remote_addr); - uint64_t cq_data = MPIDI_OFI_REQUEST(sreq, pipeline_info.cq_data); - uint64_t match_bits = MPIDI_OFI_REQUEST(sreq, pipeline_info.match_bits); - match_bits |= MPIDI_OFI_GPU_PIPELINE_SEND; + int ctx_idx = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.ctx_idx); + fi_addr_t remote_addr = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.remote_addr); + uint64_t cq_data = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.cq_data); + uint64_t match_bits = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.match_bits) | + MPIDI_OFI_GPU_PIPELINE_SEND; MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci_local).lock); MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx, buf, chunk_sz, NULL /* desc */ , @@ -245,7 +244,6 @@ int MPIDI_OFI_gpu_pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Reques struct recv_alloc { MPIR_Request *rreq; struct chunk_req *chunk_req; - int idx; int n_chunks; }; @@ -259,14 +257,14 @@ int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, { int mpi_errno = MPI_SUCCESS; - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) = 0; - MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) = false; - MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr) = remote_addr; - MPIDI_OFI_REQUEST(rreq, pipeline_info.vci_local) = vci_local; - MPIDI_OFI_REQUEST(rreq, pipeline_info.match_bits) = match_bits; - MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits) = mask_bits; - MPIDI_OFI_REQUEST(rreq, pipeline_info.data_sz) = data_sz; - MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx) = ctx_idx; + /* The 1st recv is an empty chunk for matching. We need initialize rreq. 
*/ + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.offset) = 0; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.is_sync) = false; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.remote_addr) = remote_addr; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.vci_local) = vci_local; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.match_bits) = match_bits; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.mask_bits) = mask_bits; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.ctx_idx) = ctx_idx; /* Save original buf, datatype and count */ MPIDI_OFI_REQUEST(rreq, noncontig.pack.buf) = recv_buf; @@ -278,7 +276,6 @@ int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, MPIR_Assert(p); p->rreq = rreq; - p->idx = 0; p->n_chunks = -1; /* it's MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT */ mpi_errno = MPIR_Async_things_add(recv_alloc_poll, p); @@ -296,7 +293,6 @@ static int start_recv_chunk(MPIR_Request * rreq, int idx, int n_chunks) MPIR_Assert(p); p->rreq = rreq; - p->idx = idx; p->n_chunks = n_chunks; mpi_errno = MPIR_Async_things_add(recv_alloc_poll, p); @@ -319,11 +315,11 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) return MPIR_ASYNC_THING_NOPROGRESS; } - fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.remote_addr); - int ctx_idx = MPIDI_OFI_REQUEST(rreq, pipeline_info.ctx_idx); + fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.remote_addr); + int ctx_idx = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.ctx_idx); int vci = MPIDI_Request_get_vci(rreq); - uint64_t match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.match_bits); - uint64_t mask_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.mask_bits); + uint64_t match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.match_bits); + uint64_t mask_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.mask_bits); struct chunk_req *chunk_req; chunk_req = MPL_malloc(sizeof(*chunk_req), MPL_MEM_BUFFER); @@ -380,7 +376,7 @@ int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Reques rreq->status.MPI_TAG = 
MPIDI_OFI_init_get_tag(wc->tag); if (unlikely(MPIDI_OFI_is_tag_sync(wc->tag))) { - MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync) = true; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.is_sync) = true; } uint32_t packed = MPIDI_OFI_idata_get_gpu_packed_bit(wc->data); @@ -435,7 +431,7 @@ static int start_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, { int mpi_errno = MPI_SUCCESS; - MPI_Aint offset = MPIDI_OFI_REQUEST(rreq, pipeline_info.offset); + MPI_Aint offset = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.offset); int engine_type = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_H2D_ENGINE_TYPE; /* FIXME: current design unpacks all bytes from host buffer, overflow check is missing. */ @@ -445,7 +441,7 @@ static int start_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, MPL_GPU_COPY_H2D, engine_type, 1, &async_req); MPIR_ERR_CHECK(mpi_errno); - MPIDI_OFI_REQUEST(rreq, pipeline_info.offset) += chunk_sz; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.offset) += chunk_sz; struct recv_copy *p; p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER); @@ -486,7 +482,7 @@ static void recv_copy_complete(MPIR_Request * rreq, void *buf) MPIR_cc_decr(rreq->cc_ptr, &c); if (c == 0) { /* all chunks arrived and copied */ - if (unlikely(MPIDI_OFI_REQUEST(rreq, pipeline_info.is_sync))) { + if (unlikely(MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.is_sync))) { MPIR_Comm *comm = rreq->comm; uint64_t ss_bits = MPIDI_OFI_init_sendtag(MPL_atomic_relaxed_load_int @@ -513,7 +509,7 @@ static void recv_copy_complete(MPIR_Request * rreq, void *buf) MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(rreq, datatype)); /* Set number of bytes in status. 
*/ - MPIR_STATUS_SET_COUNT(rreq->status, MPIDI_OFI_REQUEST(rreq, pipeline_info.offset)); + MPIR_STATUS_SET_COUNT(rreq->status, MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.offset)); MPIR_Request_free(rreq); } diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h index 8ccdb44a499..bfc9ea5b53c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_pre.h +++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h @@ -216,20 +216,23 @@ typedef struct { struct iovec iov; void *inject_buf; /* Internal buffer for inject emulation */ } util; - struct { - fi_addr_t remote_addr; - int ctx_idx; - int vci_local; - int chunk_sz; - bool is_sync; - uint64_t cq_data; - uint64_t match_bits; - uint64_t mask_bits; - size_t offset; - size_t data_sz; - char *pack_recv_buf; - void *usm_host_buf; /* recv */ - MPIR_Request *req; + union { + struct { + int vci_local; + int ctx_idx; + fi_addr_t remote_addr; + uint64_t cq_data; + uint64_t match_bits; + } send; + struct { + int vci_local; + int ctx_idx; + fi_addr_t remote_addr; + uint64_t match_bits; + uint64_t mask_bits; + MPI_Aint offset; + bool is_sync; + } recv; } pipeline_info; /* GPU pipeline */ } MPIDI_OFI_request_t; From 52b93ad76f2697bc90204951c68d3f5d275e01e4 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 5 Feb 2024 21:32:13 -0600 Subject: [PATCH 12/17] ch4/ofi: use explicit counters to track gpu pipeline Don't mix the usage of cc_ptr, use separate and explicit counters to track the progress and completion of chunks. 
--- src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 30 ++++++++++++---------- src/mpid/ch4/netmod/ofi/ofi_pre.h | 3 +++ 2 files changed, 19 insertions(+), 14 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index 707af00b33e..2f65331ebc1 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -59,6 +59,7 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPIDI_OFI_idata_set_gpuchunk_bits(&cq_data, n_chunks); MPIDI_OFI_idata_set_gpu_packed_bit(&cq_data, is_packed); + MPIDI_OFI_REQUEST(sreq, pipeline_info.send.num_remain) = n_chunks; MPIDI_OFI_REQUEST(sreq, pipeline_info.send.cq_data) = cq_data; MPIDI_OFI_REQUEST(sreq, pipeline_info.send.remote_addr) = remote_addr; MPIDI_OFI_REQUEST(sreq, pipeline_info.send.vci_local) = vci_local; @@ -123,9 +124,6 @@ static int send_alloc_poll(MPIR_Async_thing * thing) p->offset += (size_t) chunk_sz; p->left_sz -= (size_t) chunk_sz; p->n_chunks++; - /* Increase request completion cnt, cc is 1 more than necessary - * to prevent parent request being freed prematurally. */ - MPIR_cc_inc(p->sreq->cc_ptr); spawn_send_copy(thing, p->sreq, &async_req, host_buf, chunk_sz); @@ -228,11 +226,10 @@ int MPIDI_OFI_gpu_pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Reques MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_send_pool, host_buf); - int c; - MPIR_cc_decr(sreq->cc_ptr, &c); - if (c == 0) { + MPIDI_OFI_REQUEST(sreq, pipeline_info.send.num_remain) -= 1; + if (MPIDI_OFI_REQUEST(sreq, pipeline_info.send.num_remain) == 0) { MPIR_Datatype_release_if_not_builtin(MPIDI_OFI_REQUEST(sreq, datatype)); - MPIR_Request_free(sreq); + MPIDI_Request_complete_fast(sreq); } return mpi_errno; @@ -259,6 +256,8 @@ int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, /* The 1st recv is an empty chunk for matching. We need initialize rreq. 
*/ MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.offset) = 0; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) = 0; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = 0; MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.is_sync) = false; MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.remote_addr) = remote_addr; MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.vci_local) = vci_local; @@ -305,7 +304,8 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) struct recv_alloc *p = MPIR_Async_thing_get_state(thing); MPIR_Request *rreq = p->rreq; - if (MPIR_cc_get(rreq->cc) > 1) { + /* arbitrary threshold */ + if (MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) > 1) { return MPIR_ASYNC_THING_NOPROGRESS; } @@ -339,6 +339,7 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) match_bits, mask_bits, (void *) &chunk_req->context); MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); if (ret == 0) { + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) += 1; MPL_free(p); /* chunk_req and host_buf will be freed in recv_events */ return MPIR_ASYNC_THING_DONE; @@ -382,17 +383,18 @@ int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Reques uint32_t packed = MPIDI_OFI_idata_get_gpu_packed_bit(wc->data); uint32_t n_chunks = MPIDI_OFI_idata_get_gpuchunk_bits(wc->data); /* ? - Not sure why sender cannot send packed data */ - MPIR_Assertp(packed == 0); + MPIR_Assert(packed == 0); if (wc->len > 0) { /* message from a normal send */ MPIR_Assert(n_chunks == 0); + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = 1; mpi_errno = start_recv_copy(rreq, host_buf, wc->len, recv_buf, recv_count, datatype); MPIR_ERR_CHECK(mpi_errno); } else { MPIR_Assert(n_chunks > 0); + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = n_chunks; /* There is no data in the init chunk, free the buffer */ MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, host_buf); - MPIR_cc_dec(rreq->cc_ptr); /* Post recv for the remaining chunks. 
*/ for (int i = 0; i < n_chunks; i++) { mpi_errno = start_recv_chunk(rreq, i, n_chunks); @@ -401,6 +403,7 @@ int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Reques } } else { MPIR_Assert(event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE); + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) -= 1; mpi_errno = start_recv_copy(rreq, host_buf, wc->len, recv_buf, recv_count, datatype); MPIR_ERR_CHECK(mpi_errno); } @@ -478,9 +481,8 @@ static int recv_copy_poll(MPIR_Async_thing * thing) static void recv_copy_complete(MPIR_Request * rreq, void *buf) { int mpi_errno = MPI_SUCCESS; - int c; - MPIR_cc_decr(rreq->cc_ptr, &c); - if (c == 0) { + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) -= 1; + if (MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) == 0) { /* all chunks arrived and copied */ if (unlikely(MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.is_sync))) { MPIR_Comm *comm = rreq->comm; @@ -511,7 +513,7 @@ static void recv_copy_complete(MPIR_Request * rreq, void *buf) /* Set number of bytes in status. */ MPIR_STATUS_SET_COUNT(rreq->status, MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.offset)); - MPIR_Request_free(rreq); + MPIDI_Request_complete_fast(rreq); } /* Free host buffer, yaksa request and task. 
*/ diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h index bfc9ea5b53c..07b999ca808 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_pre.h +++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h @@ -223,6 +223,7 @@ typedef struct { fi_addr_t remote_addr; uint64_t cq_data; uint64_t match_bits; + int num_remain; } send; struct { int vci_local; @@ -231,6 +232,8 @@ typedef struct { uint64_t match_bits; uint64_t mask_bits; MPI_Aint offset; + int num_inrecv; + int num_remain; bool is_sync; } recv; } pipeline_info; /* GPU pipeline */ From 96988a1d0772510665c131ca89c75589b2df3aff Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Mon, 5 Feb 2024 22:27:16 -0600 Subject: [PATCH 13/17] ch4/ofi: use internal tag for pipeline chunk match_bits Follow a similar approach as nonblocking collectives, internal pipeline chunks use separate tag space (MPIDI_OFI_GPU_PIPELINE_SEND) and incrementing tags to avoid mismatch with regular messages. --- src/mpid/ch4/netmod/ofi/ofi_comm.c | 3 ++ src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 45 +++++++++++++--------- src/mpid/ch4/netmod/ofi/ofi_impl.h | 2 +- src/mpid/ch4/netmod/ofi/ofi_pre.h | 3 ++ src/mpid/ch4/netmod/ofi/ofi_send.h | 3 +- src/mpid/ch4/netmod/ofi/ofi_types.h | 30 +-------------- 6 files changed, 38 insertions(+), 48 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_comm.c b/src/mpid/ch4/netmod/ofi/ofi_comm.c index 57b9cb131de..8936941498a 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_comm.c +++ b/src/mpid/ch4/netmod/ofi/ofi_comm.c @@ -145,6 +145,9 @@ int MPIDI_OFI_mpi_comm_commit_pre_hook(MPIR_Comm * comm) MPIDI_OFI_COMM(comm).enable_hashing = 0; MPIDI_OFI_COMM(comm).pref_nic = NULL; + /* Initialize tag for gpu_pipeline chunks; incremented by sender. 
*/ + MPIDI_OFI_COMM(comm).pipeline_tag = 0; + if (comm->hints[MPIR_COMM_HINT_ENABLE_MULTI_NIC_STRIPING] == -1) { comm->hints[MPIR_COMM_HINT_ENABLE_MULTI_NIC_STRIPING] = MPIR_CVAR_CH4_OFI_ENABLE_MULTI_NIC_STRIPING; diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index 2f65331ebc1..3c4e17db02f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -14,6 +14,11 @@ struct chunk_req { void *buf; }; +struct pipeline_header { + int n_chunks; + int pipeline_tag; +}; + static void spawn_send_copy(MPIR_Async_thing * thing, MPIR_Request * sreq, MPIR_async_req * areq, const void *buf, MPI_Aint chunk_sz); static int start_recv_chunk(MPIR_Request * rreq, int idx, int n_chunks); @@ -39,12 +44,11 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, MPL_pointer_attr_t attr, MPI_Aint data_sz, uint64_t cq_data, fi_addr_t remote_addr, - int vci_local, int ctx_idx, uint64_t match_bits) + int vci_local, int ctx_idx, uint64_t match_bits, int pipeline_tag) { int mpi_errno = MPI_SUCCESS; uint32_t n_chunks = 0; - uint64_t is_packed = 0; /* always 0 ? 
*/ MPI_Aint chunk_sz = MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ; if (data_sz <= chunk_sz) { /* data fits in a single chunk */ @@ -56,8 +60,6 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, n_chunks++; } } - MPIDI_OFI_idata_set_gpuchunk_bits(&cq_data, n_chunks); - MPIDI_OFI_idata_set_gpu_packed_bit(&cq_data, is_packed); MPIDI_OFI_REQUEST(sreq, pipeline_info.send.num_remain) = n_chunks; MPIDI_OFI_REQUEST(sreq, pipeline_info.send.cq_data) = cq_data; @@ -65,9 +67,15 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPIDI_OFI_REQUEST(sreq, pipeline_info.send.vci_local) = vci_local; MPIDI_OFI_REQUEST(sreq, pipeline_info.send.ctx_idx) = ctx_idx; MPIDI_OFI_REQUEST(sreq, pipeline_info.send.match_bits) = match_bits; + MPIDI_OFI_REQUEST(sreq, pipeline_info.send.pipeline_tag) = pipeline_tag; + + struct pipeline_header hdr; + hdr.n_chunks = n_chunks; + hdr.pipeline_tag = pipeline_tag; /* Send the initial empty packet for matching */ - MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, NULL, 0, cq_data, + MPIDI_OFI_CALL_RETRY(fi_tinjectdata(MPIDI_OFI_global.ctx[ctx_idx].tx, + &hdr, sizeof(hdr), cq_data | MPIDI_OFI_IDATA_PIPELINE, remote_addr, match_bits), vci_local, tinjectdata); struct send_alloc *p; @@ -197,7 +205,7 @@ static void send_copy_complete(MPIR_Request * sreq, const void *buf, MPI_Aint ch int ctx_idx = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.ctx_idx); fi_addr_t remote_addr = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.remote_addr); uint64_t cq_data = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.cq_data); - uint64_t match_bits = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.match_bits) | + uint64_t match_bits = MPIDI_OFI_REQUEST(sreq, pipeline_info.send.pipeline_tag) | MPIDI_OFI_GPU_PIPELINE_SEND; MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci_local).lock); MPIDI_OFI_CALL_RETRY(fi_tsenddata(MPIDI_OFI_global.ctx[ctx_idx].tx, @@ -318,7 +326,6 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) 
fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.remote_addr); int ctx_idx = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.ctx_idx); int vci = MPIDI_Request_get_vci(rreq); - uint64_t match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.match_bits); uint64_t mask_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.mask_bits); struct chunk_req *chunk_req; @@ -327,10 +334,14 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) chunk_req->parent = rreq; chunk_req->buf = host_buf; + + uint64_t match_bits; if (p->n_chunks == -1) { + match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.match_bits); chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT; } else { - match_bits |= MPIDI_OFI_GPU_PIPELINE_SEND; + match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.pipeline_tag) | + MPIDI_OFI_GPU_PIPELINE_SEND; chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; } MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); @@ -380,24 +391,22 @@ int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Reques MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.is_sync) = true; } - uint32_t packed = MPIDI_OFI_idata_get_gpu_packed_bit(wc->data); - uint32_t n_chunks = MPIDI_OFI_idata_get_gpuchunk_bits(wc->data); - /* ? 
- Not sure why sender cannot send packed data */ - MPIR_Assert(packed == 0); - if (wc->len > 0) { + bool is_pipeline = (wc->data & MPIDI_OFI_IDATA_PIPELINE); + if (!is_pipeline) { /* message from a normal send */ - MPIR_Assert(n_chunks == 0); MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = 1; mpi_errno = start_recv_copy(rreq, host_buf, wc->len, recv_buf, recv_count, datatype); MPIR_ERR_CHECK(mpi_errno); } else { - MPIR_Assert(n_chunks > 0); - MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = n_chunks; + struct pipeline_header *p_hdr = host_buf; + MPIR_Assert(p_hdr->n_chunks > 0); + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_remain) = p_hdr->n_chunks; + MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.pipeline_tag) = p_hdr->pipeline_tag; /* There is no data in the init chunk, free the buffer */ MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, host_buf); /* Post recv for the remaining chunks. */ - for (int i = 0; i < n_chunks; i++) { - mpi_errno = start_recv_chunk(rreq, i, n_chunks); + for (int i = 0; i < p_hdr->n_chunks; i++) { + mpi_errno = start_recv_chunk(rreq, i, p_hdr->n_chunks); MPIR_ERR_CHECK(mpi_errno); } } diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index b4c959b8f35..698d68fac8e 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -831,7 +831,7 @@ int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, MPL_pointer_attr_t attr, MPI_Aint data_sz, uint64_t cq_data, fi_addr_t remote_addr, - int vci_local, int ctx_idx, uint64_t match_bits); + int vci_local, int ctx_idx, uint64_t match_bits, int pipeline_tag); int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, void *recv_buf, MPI_Aint count, MPI_Datatype datatype, fi_addr_t remote_addr, int vci_local, diff --git a/src/mpid/ch4/netmod/ofi/ofi_pre.h b/src/mpid/ch4/netmod/ofi/ofi_pre.h index 07b999ca808..48beae17fe2 100644 --- 
a/src/mpid/ch4/netmod/ofi/ofi_pre.h +++ b/src/mpid/ch4/netmod/ofi/ofi_pre.h @@ -48,6 +48,7 @@ typedef struct { int enable_striping; /* Flag to enable striping per communicator. */ int enable_hashing; /* Flag to enable hashing per communicator. */ int *pref_nic; /* Array to specify the preferred NIC for each rank (if needed) */ + int pipeline_tag; /* match_bits for gpu_pipeline chunks */ } MPIDI_OFI_comm_t; enum { MPIDI_AMTYPE_NONE = 0, @@ -223,6 +224,7 @@ typedef struct { fi_addr_t remote_addr; uint64_t cq_data; uint64_t match_bits; + int pipeline_tag; int num_remain; } send; struct { @@ -232,6 +234,7 @@ typedef struct { uint64_t match_bits; uint64_t mask_bits; MPI_Aint offset; + int pipeline_tag; int num_inrecv; int num_remain; bool is_sync; diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index 855970e8927..dc30330a84c 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -275,9 +275,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_normal(const void *buf, MPI_Aint cou data_sz >= MPIR_CVAR_CH4_OFI_GPU_PIPELINE_THRESHOLD) { /* Pipeline path */ fi_addr_t remote_addr = MPIDI_OFI_av_to_phys(addr, receiver_nic, vci_remote); + MPIDI_OFI_COMM(comm).pipeline_tag += 1; mpi_errno = MPIDI_OFI_gpu_pipeline_send(sreq, buf, count, datatype, attr, data_sz, cq_data, remote_addr, vci_local, ctx_idx, - match_bits); + match_bits, MPIDI_OFI_COMM(comm).pipeline_tag); MPIR_ERR_CHECK(mpi_errno); MPIR_T_PVAR_COUNTER_INC(MULTINIC, nic_sent_bytes_count[sender_nic], data_sz); diff --git a/src/mpid/ch4/netmod/ofi/ofi_types.h b/src/mpid/ch4/netmod/ofi/ofi_types.h index 46f49ea4478..3d3d6cf7003 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_types.h +++ b/src/mpid/ch4/netmod/ofi/ofi_types.h @@ -37,14 +37,12 @@ #define MPIDI_OFI_IDATA_ERROR_BITS (2) /* The number of bits in the immediate data field allocated to the source rank and error propagation. 
*/ #define MPIDI_OFI_IDATA_SRC_ERROR_BITS (MPIDI_OFI_IDATA_SRC_BITS + MPIDI_OFI_IDATA_ERROR_BITS) -/* The number of bits in the immediate data field allocated to MPI_Packed datatype for GPU. */ -#define MPIDI_OFI_IDATA_GPU_PACKED_BITS (1) -/* The offset of bits in the immediate data field allocated to number of message chunks. */ -#define MPIDI_OFI_IDATA_GPUCHUNK_OFFSET (MPIDI_OFI_IDATA_SRC_ERROR_BITS + MPIDI_OFI_IDATA_GPU_PACKED_BITS) /* Bit mask for MPIR_ERR_OTHER */ #define MPIDI_OFI_ERR_OTHER (0x1ULL) /* Bit mask for MPIR_PROC_FAILED */ #define MPIDI_OFI_ERR_PROC_FAILED (0x2ULL) +/* Bit mask for gpu pipeline */ +#define MPIDI_OFI_IDATA_PIPELINE (1ULL << MPIDI_OFI_IDATA_SRC_ERROR_BITS) /* Set the error bits */ MPL_STATIC_INLINE_PREFIX void MPIDI_OFI_idata_set_error_bits(uint64_t * data_field, @@ -75,30 +73,6 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_idata_get_error_bits(uint64_t idata) } } -/* Set the gpu packed bit */ -static inline void MPIDI_OFI_idata_set_gpu_packed_bit(uint64_t * data_field, uint64_t is_packed) -{ - *data_field = (*data_field) | (is_packed << MPIDI_OFI_IDATA_SRC_ERROR_BITS); -} - -/* Get the gpu packed bit from the OFI data field. */ -static inline uint32_t MPIDI_OFI_idata_get_gpu_packed_bit(uint64_t idata) -{ - return (idata >> MPIDI_OFI_IDATA_SRC_ERROR_BITS) & 0x1ULL; -} - -/* Set gpu chunk bits */ -static inline void MPIDI_OFI_idata_set_gpuchunk_bits(uint64_t * data_field, uint64_t n_chunks) -{ - *data_field = (*data_field) | (n_chunks << MPIDI_OFI_IDATA_GPUCHUNK_OFFSET); -} - -/* Get gpu chunks from the OFI data field. 
*/ -static inline uint32_t MPIDI_OFI_idata_get_gpuchunk_bits(uint64_t idata) -{ - return (idata >> MPIDI_OFI_IDATA_GPUCHUNK_OFFSET); -} - /* There are 4 protocol bits: * - MPIDI_DYNPROC_SEND * - MPIDI_OFI_HUGE_SEND From 3e741c7563a49320785977543ce36b99110f1e24 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 8 Feb 2024 11:56:19 -0600 Subject: [PATCH 14/17] ch4/ofi: refactor gpu pipeline recv_alloc Separate the recv tasks between the initial header and chunks since the paths clearly separates them. Use a single async item for all chunk recvs rather than unnecessarily enqueuing individual chunks since we can track the chunks in the state. --- src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 83 +++++++++++++++------- 1 file changed, 56 insertions(+), 27 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index 3c4e17db02f..496a6b78a6e 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -21,7 +21,7 @@ struct pipeline_header { static void spawn_send_copy(MPIR_Async_thing * thing, MPIR_Request * sreq, MPIR_async_req * areq, const void *buf, MPI_Aint chunk_sz); -static int start_recv_chunk(MPIR_Request * rreq, int idx, int n_chunks); +static int start_recv_chunk(MPIR_Request * rreq, int n_chunks); static int start_recv_copy(MPIR_Request * rreq, void *buf, MPI_Aint chunk_sz, void *recv_buf, MPI_Aint count, MPI_Datatype datatype); @@ -245,14 +245,14 @@ int MPIDI_OFI_gpu_pipeline_send_event(struct fi_cq_tagged_entry *wc, MPIR_Reques /* ------------------------------------ * recv_alloc: allocate recv chunk buffer and post fi_trecv + * There are actually two async things: issuing the initial recv and + * issuing recvs for the rest of the chunks. 
*/ -struct recv_alloc { - MPIR_Request *rreq; - struct chunk_req *chunk_req; - int n_chunks; -}; -static int recv_alloc_poll(MPIR_Async_thing * thing); +/* the state for recv_init_alloc is just the rreq */ + +static bool issue_recv_alloc(MPIR_Request * rreq, bool is_init); +static int recv_init_alloc_poll(MPIR_Async_thing * thing); int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, void *recv_buf, MPI_Aint count, MPI_Datatype datatype, @@ -278,38 +278,53 @@ int MPIDI_OFI_gpu_pipeline_recv(MPIR_Request * rreq, MPIDI_OFI_REQUEST(rreq, noncontig.pack.count) = count; MPIDI_OFI_REQUEST(rreq, noncontig.pack.datatype) = datatype; - struct recv_alloc *p; - p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER); - MPIR_Assert(p); + mpi_errno = MPIR_Async_things_add(recv_init_alloc_poll, rreq); - p->rreq = rreq; - p->n_chunks = -1; /* it's MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT */ + return mpi_errno; +} - mpi_errno = MPIR_Async_things_add(recv_alloc_poll, p); +static int recv_init_alloc_poll(MPIR_Async_thing * thing) +{ + MPIR_Request *rreq = MPIR_Async_thing_get_state(thing); - return mpi_errno; + int ret = issue_recv_alloc(rreq, true /* is_init */); + if (ret) { + return MPIR_ASYNC_THING_DONE; + } + + return MPIR_ASYNC_THING_NOPROGRESS; } +/* ---- */ +struct recv_chunk_alloc { + MPIR_Request *rreq; + int n_chunks; + int issued_chunks; +}; + +static int recv_chunk_alloc_poll(MPIR_Async_thing * thing); + /* this is called from recv_event */ -static int start_recv_chunk(MPIR_Request * rreq, int idx, int n_chunks) +static int start_recv_chunk(MPIR_Request * rreq, int n_chunks) { int mpi_errno = MPI_SUCCESS; - struct recv_alloc *p; + struct recv_chunk_alloc *p; p = MPL_malloc(sizeof(*p), MPL_MEM_OTHER); MPIR_Assert(p); p->rreq = rreq; p->n_chunks = n_chunks; + p->issued_chunks = 0; - mpi_errno = MPIR_Async_things_add(recv_alloc_poll, p); + mpi_errno = MPIR_Async_things_add(recv_chunk_alloc_poll, p); return mpi_errno; } -static int recv_alloc_poll(MPIR_Async_thing * thing) +static int 
recv_chunk_alloc_poll(MPIR_Async_thing * thing) { - struct recv_alloc *p = MPIR_Async_thing_get_state(thing); + struct recv_chunk_alloc *p = MPIR_Async_thing_get_state(thing); MPIR_Request *rreq = p->rreq; /* arbitrary threshold */ @@ -317,6 +332,23 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) return MPIR_ASYNC_THING_NOPROGRESS; } + bool ret = issue_recv_alloc(rreq, false /* is_init */); + if (ret) { + p->issued_chunks++; + if (p->issued_chunks == p->n_chunks) { + MPL_free(p); + return MPIR_ASYNC_THING_DONE; + } else { + return MPIR_ASYNC_THING_UPDATED; + } + } + + return MPIR_ASYNC_THING_NOPROGRESS; +} + +/* ---- */ +static bool issue_recv_alloc(MPIR_Request * rreq, bool is_init) +{ void *host_buf; MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, &host_buf); if (!host_buf) { @@ -336,7 +368,7 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) chunk_req->buf = host_buf; uint64_t match_bits; - if (p->n_chunks == -1) { + if (is_init) { match_bits = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.match_bits); chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE_INIT; } else { @@ -351,9 +383,8 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); if (ret == 0) { MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) += 1; - MPL_free(p); /* chunk_req and host_buf will be freed in recv_events */ - return MPIR_ASYNC_THING_DONE; + return true; } if (ret != -FI_EAGAIN && ret != -FI_ENOMEM) { /* unexpected error */ @@ -361,7 +392,7 @@ static int recv_alloc_poll(MPIR_Async_thing * thing) } MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, host_buf); MPL_free(chunk_req); - return MPIR_ASYNC_THING_NOPROGRESS; + return false; }; /* ------------------------------------ @@ -405,10 +436,8 @@ int MPIDI_OFI_gpu_pipeline_recv_event(struct fi_cq_tagged_entry *wc, MPIR_Reques /* There is no data in the init chunk, free the buffer */ 
MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, host_buf); /* Post recv for the remaining chunks. */ - for (int i = 0; i < p_hdr->n_chunks; i++) { - mpi_errno = start_recv_chunk(rreq, i, p_hdr->n_chunks); - MPIR_ERR_CHECK(mpi_errno); - } + mpi_errno = start_recv_chunk(rreq, p_hdr->n_chunks); + MPIR_ERR_CHECK(mpi_errno); } } else { MPIR_Assert(event_id == MPIDI_OFI_EVENT_RECV_GPU_PIPELINE); From 2ff20487d718fa2119c4d99495e7d06ac370c8be Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 8 Feb 2024 10:22:59 -0600 Subject: [PATCH 15/17] ch4/ofi: include ofi_impl.h in ofi_gpu_pipeline.c It is needed to compile under noinline configuration. --- src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index 496a6b78a6e..02def785b2d 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -4,6 +4,7 @@ */ #include "mpidimpl.h" +#include "ofi_impl.h" #include "mpir_async_things.h" struct chunk_req { From 8e5a2c22c4bef2f7fc52aa4b74e5d41df3010f89 Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Wed, 7 Feb 2024 14:06:42 -0600 Subject: [PATCH 16/17] ch4/ofi: move some inline util functions Move these utility functions to ofi_impl.h since they are simple and non-specific. It also simplifies figuring out which file to include especially for .c files. 
--- src/mpid/ch4/netmod/ofi/ofi_events.h | 25 ------------------ src/mpid/ch4/netmod/ofi/ofi_impl.h | 38 ++++++++++++++++++++++++++++ src/mpid/ch4/netmod/ofi/ofi_send.h | 13 ---------- 3 files changed, 38 insertions(+), 38 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_events.h b/src/mpid/ch4/netmod/ofi/ofi_events.h index 036a49b5671..d1655901db8 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_events.h +++ b/src/mpid/ch4/netmod/ofi/ofi_events.h @@ -14,31 +14,6 @@ int MPIDI_OFI_rma_done_event(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * in_req); int MPIDI_OFI_dispatch_function(int vci, struct fi_cq_tagged_entry *wc, MPIR_Request * req); -MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_recv_engine_type(int cvar) -{ - if (cvar == MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_compute) { - return MPL_GPU_ENGINE_TYPE_COMPUTE; - } else if (cvar == MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_high_bandwidth) { - return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; - } else if (cvar == MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_low_latency) { - return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; - } else { - return MPL_GPU_ENGINE_TYPE_LAST; - } -} - -MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_cqe_get_source(struct fi_cq_tagged_entry *wc, bool has_err) -{ - if (MPIDI_OFI_ENABLE_DATA) { - if (unlikely(has_err)) { - return wc->data & ((1 << MPIDI_OFI_IDATA_SRC_BITS) - 1); - } - return wc->data; - } else { - return MPIDI_OFI_init_get_source(wc->tag); - } -} - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_event(int vci, struct fi_cq_tagged_entry *wc /* unused */ , MPIR_Request * sreq, int event_id) diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h index 698d68fac8e..fcad19a723f 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_impl.h +++ b/src/mpid/ch4/netmod/ofi/ofi_impl.h @@ -827,6 +827,44 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_gpu_free_pack_buffer(void *ptr) } } +MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t 
MPIDI_OFI_gpu_get_send_engine_type(int cvar) +{ + if (cvar == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_compute) { + return MPL_GPU_ENGINE_TYPE_COMPUTE; + } else if (cvar == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_high_bandwidth) { + return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; + } else if (cvar == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_low_latency) { + return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; + } else { + return MPL_GPU_ENGINE_TYPE_LAST; + } +} + +MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_recv_engine_type(int cvar) +{ + if (cvar == MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_compute) { + return MPL_GPU_ENGINE_TYPE_COMPUTE; + } else if (cvar == MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_high_bandwidth) { + return MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; + } else if (cvar == MPIR_CVAR_CH4_OFI_GPU_RECEIVE_ENGINE_TYPE_copy_low_latency) { + return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; + } else { + return MPL_GPU_ENGINE_TYPE_LAST; + } +} + +MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_cqe_get_source(struct fi_cq_tagged_entry *wc, bool has_err) +{ + if (MPIDI_OFI_ENABLE_DATA) { + if (unlikely(has_err)) { + return wc->data & ((1 << MPIDI_OFI_IDATA_SRC_BITS) - 1); + } + return wc->data; + } else { + return MPIDI_OFI_init_get_source(wc->tag); + } +} + int MPIDI_OFI_gpu_pipeline_send(MPIR_Request * sreq, const void *send_buf, MPI_Aint count, MPI_Datatype datatype, MPL_pointer_attr_t attr, MPI_Aint data_sz, diff --git a/src/mpid/ch4/netmod/ofi/ofi_send.h b/src/mpid/ch4/netmod/ofi/ofi_send.h index dc30330a84c..34d1fcb04dd 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_send.h +++ b/src/mpid/ch4/netmod/ofi/ofi_send.h @@ -8,19 +8,6 @@ #include "ofi_impl.h" -MPL_STATIC_INLINE_PREFIX MPL_gpu_engine_type_t MPIDI_OFI_gpu_get_send_engine_type(int cvar) -{ - if (cvar == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_compute) { - return MPL_GPU_ENGINE_TYPE_COMPUTE; - } else if (cvar == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_high_bandwidth) { - return 
MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH; - } else if (cvar == MPIR_CVAR_CH4_OFI_GPU_SEND_ENGINE_TYPE_copy_low_latency) { - return MPL_GPU_ENGINE_TYPE_COPY_LOW_LATENCY; - } else { - return MPL_GPU_ENGINE_TYPE_LAST; - } -} - MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send_lightweight(const void *buf, size_t data_sz, uint64_t cq_data, From 8aacd180a3ab17c29663e603868f14bbd93e7a6e Mon Sep 17 00:00:00 2001 From: Hui Zhou Date: Thu, 29 Feb 2024 08:45:45 -0600 Subject: [PATCH 17/17] ch4/ofi: remove limit in pipeline recv chunk progress Remove the limit in posting gpu pipeline recv chunks. The limit can be controlled by the maximum chunks from MPIDI_OFI_global.gpu_pipeline_recv_pool or when the libfabric return EAGAIN. In progressing the recv_chunk_alloc, we'll issue as many chunks as we can instead of one at a time. Refactor the code to have single exit point. --- src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c | 47 +++++++++++++--------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c index 02def785b2d..96295a9d340 100644 --- a/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c +++ b/src/mpid/ch4/netmod/ofi/ofi_gpu_pipeline.c @@ -325,35 +325,39 @@ static int start_recv_chunk(MPIR_Request * rreq, int n_chunks) static int recv_chunk_alloc_poll(MPIR_Async_thing * thing) { + int ret = MPIR_ASYNC_THING_NOPROGRESS; struct recv_chunk_alloc *p = MPIR_Async_thing_get_state(thing); MPIR_Request *rreq = p->rreq; - /* arbitrary threshold */ - if (MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) > 1) { - return MPIR_ASYNC_THING_NOPROGRESS; - } + while (true) { + bool rc = issue_recv_alloc(rreq, false /* is_init */); + if (!rc) { + goto fn_exit; + } - bool ret = issue_recv_alloc(rreq, false /* is_init */); - if (ret) { p->issued_chunks++; if (p->issued_chunks == p->n_chunks) { MPL_free(p); - return MPIR_ASYNC_THING_DONE; + ret = MPIR_ASYNC_THING_DONE; + goto fn_exit; } else { - return 
MPIR_ASYNC_THING_UPDATED; + ret = MPIR_ASYNC_THING_UPDATED; } } - return MPIR_ASYNC_THING_NOPROGRESS; + fn_exit: + return ret; } /* ---- */ static bool issue_recv_alloc(MPIR_Request * rreq, bool is_init) { + bool ret = false; void *host_buf; + MPIDU_genq_private_pool_alloc_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, &host_buf); if (!host_buf) { - return MPIR_ASYNC_THING_NOPROGRESS; + goto fn_exit; } fi_addr_t remote_addr = MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.remote_addr); @@ -378,22 +382,25 @@ static bool issue_recv_alloc(MPIR_Request * rreq, bool is_init) chunk_req->event_id = MPIDI_OFI_EVENT_RECV_GPU_PIPELINE; } MPID_THREAD_CS_ENTER(VCI, MPIDI_VCI(vci).lock); - int ret = fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx, - host_buf, MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, NULL, remote_addr, - match_bits, mask_bits, (void *) &chunk_req->context); + int rc = fi_trecv(MPIDI_OFI_global.ctx[ctx_idx].rx, + host_buf, MPIR_CVAR_CH4_OFI_GPU_PIPELINE_BUFFER_SZ, NULL, remote_addr, + match_bits, mask_bits, (void *) &chunk_req->context); MPID_THREAD_CS_EXIT(VCI, MPIDI_VCI(vci).lock); - if (ret == 0) { + if (rc == 0) { MPIDI_OFI_REQUEST(rreq, pipeline_info.recv.num_inrecv) += 1; /* chunk_req and host_buf will be freed in recv_events */ - return true; - } - if (ret != -FI_EAGAIN && ret != -FI_ENOMEM) { - /* unexpected error */ - MPIR_Assert(0); + ret = true; + goto fn_exit; } + + /* assert unexpected error */ + MPIR_Assert(rc != -FI_EAGAIN && rc != -FI_ENOMEM); + MPIDU_genq_private_pool_free_cell(MPIDI_OFI_global.gpu_pipeline_recv_pool, host_buf); MPL_free(chunk_req); - return false; + + fn_exit: + return ret; }; /* ------------------------------------