Skip to content

Commit 9b71f99

Browse files
committed
comm: add pmix timeout knob to group ops
Add a PMIx timeout option to the PMIx group operations (PMIx_Group_construct/PMIx_Group_destruct) that Open MPI uses to construct certain types of communicators. Reuse the existing PMIx connect timeout MCA parameter rather than adding yet another knob. Signed-off-by: Howard Pritchard <[email protected]>
1 parent 9ba5034 commit 9b71f99

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

ompi/communicator/comm_cid.c

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
* Copyright (c) 2017 Mellanox Technologies. All rights reserved.
2525
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
2626
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
27-
* Copyright (c) 2020-2024 Triad National Security, LLC. All rights
27+
* Copyright (c) 2020-2025 Triad National Security, LLC. All rights
2828
* reserved.
2929
* $COPYRIGHT$
3030
*
@@ -320,6 +320,7 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
320320
pmix_proc_t *procs = NULL;
321321
void *grpinfo = NULL, *list = NULL;
322322
pmix_data_array_t darray;
323+
pmix_info_t tinfo;
323324

324325
switch (mode) {
325326
case OMPI_COMM_CID_GROUP_NEW:
@@ -349,6 +350,13 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
349350
goto fn_exit;
350351
}
351352

353+
rc = PMIx_Info_list_add(grpinfo, PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
354+
if (PMIX_SUCCESS != rc) {
355+
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Info_list_add failed %s %d", PMIx_Error_string(rc), __LINE__));
356+
rc = OMPI_ERR_OUT_OF_RESOURCE;
357+
goto fn_exit;
358+
}
359+
352360
list = PMIx_Info_list_start();
353361

354362
size_t c_index = (size_t)newcomm->c_index;
@@ -450,7 +458,10 @@ static int ompi_comm_ext_cid_new_block (ompi_communicator_t *newcomm, ompi_commu
450458
tag, tproc_count, ninfo, cid_base));
451459

452460
/* destruct the group */
453-
rc = PMIx_Group_destruct (tag, NULL, 0);
461+
PMIX_INFO_CONSTRUCT(&tinfo);
462+
PMIX_INFO_LOAD(&tinfo, PMIX_TIMEOUT, &ompi_pmix_connect_timeout, PMIX_UINT32);
463+
rc = PMIx_Group_destruct (tag, &tinfo, 0);
464+
PMIX_INFO_DESTRUCT(&tinfo);
454465
if(PMIX_SUCCESS != rc) {
455466
OPAL_OUTPUT_VERBOSE((10, ompi_comm_output, "PMIx_Group_destruct failed %s", PMIx_Error_string(rc)));
456467
rc = opal_pmix_convert_status(rc);

ompi/runtime/ompi_mpi_params.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
* All rights reserved.
2121
* Copyright (c) 2016-2021 Research Organization for Information Science
2222
* and Technology (RIST). All rights reserved.
23-
* Copyright (c) 2018-2024 Triad National Security, LLC. All rights
23+
* Copyright (c) 2018-2025 Triad National Security, LLC. All rights
2424
* reserved.
2525
* Copyright (c) 2021 Nanook Consulting. All rights reserved.
2626
* Copyright (c) 2022 IBM Corporation. All rights reserved.
@@ -391,7 +391,7 @@ int ompi_mpi_register_params(void)
391391

392392
ompi_pmix_connect_timeout = 0; /* infinite timeout - see PMIx standard */
393393
(void) mca_base_var_register ("ompi", "mpi", NULL, "pmix_connect_timeout",
394-
"Timeout(secs) for calls to PMIx_Connect. Default is no timeout.",
394+
"Timeout(secs) for calls to PMIx_Connect and PMIx_Group_construct/destruct. Default is no timeout.",
395395
MCA_BASE_VAR_TYPE_UNSIGNED_INT, NULL,
396396
0, 0, OPAL_INFO_LVL_3, MCA_BASE_VAR_SCOPE_LOCAL,
397397
&ompi_pmix_connect_timeout);

0 commit comments

Comments
 (0)