Skip to content

Commit

Permalink
pml/ob1: fix double increment of the RDMA frag retry counter
Browse files Browse the repository at this point in the history
If a put or get operation fails, it may later be retried by
mca_pml_ob1_process_pending_rdma, which increments the frag's retries counter
on each new attempt. There is a flaw in the code where the put and get failure
handlers also increment this counter, so the frag reaches the retry limit (and
gives up) twice as fast. This commit removes the increments from the put and
get failure handlers.

Signed-off-by: Nathan Hjelm <[email protected]>
  • Loading branch information
hjelmn committed Sep 19, 2024
1 parent 020e83f commit 27efeb9
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 19 deletions.
2 changes: 1 addition & 1 deletion ompi/mca/pml/ob1/pml_ob1_recvreq.c
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ static int mca_pml_ob1_recv_request_get_frag_failed (mca_pml_ob1_rdma_frag_t *fr
}
}

if (++frag->retries < mca_pml_ob1.rdma_retries_limit &&
if (frag->retries < mca_pml_ob1.rdma_retries_limit &&
OMPI_ERR_OUT_OF_RESOURCE == rc) {
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
Expand Down
37 changes: 19 additions & 18 deletions ompi/mca/pml/ob1/pml_ob1_sendreq.c
Original file line number Diff line number Diff line change
Expand Up @@ -1275,30 +1275,31 @@ static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *f
mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
mca_bml_base_btl_t *bml_btl = frag->rdma_bml;

if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
if (frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
/* queue the frag for later if there was a resource error */
OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
} else {
return;
}

#if OPAL_ENABLE_FT
if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
return;
}
#endif /* OPAL_ENABLE_FT */
/* tell receiver to deregister memory */
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
OPAL_ERR_TEMP_OUT_OF_RESOURCE);

/* send fragment by copy in/out */
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
frag->rdma_length);
/* if a pointer to a receive request is not set it means that
* ACK was not yet received. Don't schedule sends before ACK */
if (NULL != sendreq->req_recv.pval)
mca_pml_ob1_send_request_schedule (sendreq);
if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
return;
}
#endif /* OPAL_ENABLE_FT */
/* tell receiver to deregister memory */
mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
OPAL_ERR_TEMP_OUT_OF_RESOURCE);

/* send fragment by copy in/out */
mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
frag->rdma_length);
/* if a pointer to a receive request is not set it means that
* ACK was not yet received. Don't schedule sends before ACK */
if (NULL != sendreq->req_recv.pval)
mca_pml_ob1_send_request_schedule (sendreq);
}

/**
Expand Down

0 comments on commit 27efeb9

Please sign in to comment.