@@ -22,6 +22,7 @@
  * Copyright (c) 2018-2019 Triad National Security, LLC. All rights
  *                         reserved.
  * Copyright (c) 2022      IBM Corporation. All rights reserved.
+ * Copyright (c) 2024      Google, LLC. All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -1110,6 +1111,12 @@ mca_pml_ob1_send_request_schedule_once(mca_pml_ob1_send_request_t* sendreq)
 
     range = get_send_range(sendreq);
 
+    if (NULL != sendreq->rdma_frag) {
+        /* this request was first attempted with RDMA but is now using send/recv */
+        MCA_PML_OB1_RDMA_FRAG_RETURN(sendreq->rdma_frag);
+        sendreq->rdma_frag = NULL;
+    }
+
     while(range && (false == sendreq->req_throttle_sends ||
                     sendreq->req_pipeline_depth < mca_pml_ob1.send_pipeline_depth)) {
         mca_pml_ob1_frag_hdr_t* hdr;
@@ -1268,30 +1275,31 @@ static void mca_pml_ob1_send_request_put_frag_failed (mca_pml_ob1_rdma_frag_t *f
     mca_pml_ob1_send_request_t* sendreq = (mca_pml_ob1_send_request_t *) frag->rdma_req;
     mca_bml_base_btl_t *bml_btl = frag->rdma_bml;
 
-    if (++frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
+    if (frag->retries < mca_pml_ob1.rdma_retries_limit && OMPI_ERR_OUT_OF_RESOURCE == rc) {
         /* queue the frag for later if there was a resource error */
         OPAL_THREAD_LOCK(&mca_pml_ob1.lock);
         opal_list_append(&mca_pml_ob1.rdma_pending, (opal_list_item_t*)frag);
         OPAL_THREAD_UNLOCK(&mca_pml_ob1.lock);
-    } else {
+        return;
+    }
+
 #if OPAL_ENABLE_FT
-        if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
-            return;
-        }
-#endif /* OPAL_ENABLE_FT */
-        /* tell receiver to deregister memory */
-        mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
-                              frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
-                              OPAL_ERR_TEMP_OUT_OF_RESOURCE);
-
-        /* send fragment by copy in/out */
-        mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
-                                             frag->rdma_length);
-        /* if a pointer to a receive request is not set it means that
-         * ACK was not yet received. Don't schedule sends before ACK */
-        if (NULL != sendreq->req_recv.pval)
-            mca_pml_ob1_send_request_schedule (sendreq);
+    if(!ompi_proc_is_active(sendreq->req_send.req_base.req_proc)) {
+        return;
     }
+#endif /* OPAL_ENABLE_FT */
+    /* tell receiver to deregister memory */
+    mca_pml_ob1_send_fin (sendreq->req_send.req_base.req_proc, bml_btl,
+                          frag->rdma_hdr.hdr_rdma.hdr_frag, 0, MCA_BTL_NO_ORDER,
+                          OPAL_ERR_TEMP_OUT_OF_RESOURCE);
+
+    /* send fragment by copy in/out */
+    mca_pml_ob1_send_request_copy_in_out(sendreq, frag->rdma_hdr.hdr_rdma.hdr_rdma_offset,
+                                         frag->rdma_length);
+    /* if a pointer to a receive request is not set it means that
+     * ACK was not yet received. Don't schedule sends before ACK */
+    if (NULL != sendreq->req_recv.pval)
+        mca_pml_ob1_send_request_schedule (sendreq);
 }
 
 /**