Skip to content

Commit ae7fe9a

Browse files
authored
Merge pull request #12465 from hppritcha/fix_for_intercomm_cid_alloc
comm/cid: fix for edge case with intercomm creation
2 parents 85fd1bc + 15a3d26 commit ae7fe9a

File tree

6 files changed: 36 additions and 13 deletions.

ompi/communicator/comm.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2200,7 +2200,7 @@ int ompi_comm_free( ompi_communicator_t **comm )
22002200
* makes sure that the pointer to the dependent communicator
22012201
* still contains a valid object.
22022202
*/
2203-
ompi_communicator_t *tmpcomm = (ompi_communicator_t *) opal_pointer_array_get_item(&ompi_mpi_communicators, cid);
2203+
ompi_communicator_t *tmpcomm = ompi_comm_lookup(cid);
22042204
if ( NULL != tmpcomm ){
22052205
ompi_comm_free(&tmpcomm);
22062206
}

ompi/communicator/comm_cid.c

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -590,7 +590,7 @@ static int ompi_comm_allreduce_getnextcid (ompi_comm_request_t *request)
590590
context->nextlocal_cid = mca_pml.pml_max_contextid;
591591
for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
592592
flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
593-
context->comm);
593+
OMPI_COMM_SENTINEL);
594594
if (true == flag) {
595595
context->nextlocal_cid = i;
596596
break;
@@ -664,7 +664,7 @@ static int ompi_comm_checkcid (ompi_comm_request_t *request)
664664
opal_pointer_array_set_item(&ompi_mpi_communicators, context->nextlocal_cid, NULL);
665665

666666
context->flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators,
667-
context->nextcid, context->comm);
667+
context->nextcid, OMPI_COMM_SENTINEL);
668668
}
669669
}
670670

@@ -716,7 +716,7 @@ static int ompi_comm_nextcid_check_flag (ompi_comm_request_t *request)
716716
for (unsigned int i = context->start ; i < mca_pml.pml_max_contextid ; ++i) {
717717
bool flag;
718718
flag = opal_pointer_array_test_and_set_item (&ompi_mpi_communicators, i,
719-
context->comm);
719+
OMPI_COMM_SENTINEL);
720720
if (true == flag) {
721721
context->nextlocal_cid = i;
722722
break;
@@ -1588,4 +1588,3 @@ static int ompi_comm_ft_allreduce_intra_pmix_nb(int *inbuf, int *outbuf, int cou
15881588
}
15891589

15901590
#endif /* OPAL_ENABLE_FT_MPI */
1591-

ompi/communicator/comm_init.c

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -355,13 +355,13 @@ static int ompi_comm_finalize (void)
355355
OBJ_DESTRUCT( &ompi_mpi_comm_null );
356356

357357
/* Check whether we have some communicators left */
358-
max = opal_pointer_array_get_size(&ompi_mpi_communicators);
358+
max = ompi_comm_get_num_communicators();
359359
for ( i=3; i<max; i++ ) {
360-
comm = (ompi_communicator_t *)opal_pointer_array_get_item(&ompi_mpi_communicators, i);
360+
comm = ompi_comm_lookup(i);
361361
if ( NULL != comm ) {
362362
/* Communicator has not been freed before finalize */
363363
OBJ_RELEASE(comm);
364-
comm=(ompi_communicator_t *)opal_pointer_array_get_item(&ompi_mpi_communicators, i);
364+
comm = ompi_comm_lookup(i);
365365
if ( NULL != comm ) {
366366
/* Still here ? */
367367
if ( !OMPI_COMM_IS_EXTRA_RETAIN(comm)) {

ompi/communicator/communicator.h

Lines changed: 25 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -140,6 +140,11 @@ OMPI_DECLSPEC OBJ_CLASS_DECLARATION(ompi_communicator_t);
140140
#define OMPI_COMM_BLOCK_WORLD 16
141141
#define OMPI_COMM_BLOCK_OTHERS 8
142142

143+
/**
144+
* Placeholder to use in array of ompi communicators during CID allocation
145+
*/
146+
#define OMPI_COMM_SENTINEL 0x00000001
147+
143148
/* A macro comparing two CIDs */
144149
#define OMPI_COMM_CID_IS_LOWER(comm1,comm2) ( ((comm1)->c_index < (comm2)->c_index)? 1:0)
145150

@@ -552,8 +557,27 @@ static inline bool ompi_comm_compare_cids (const ompi_communicator_t *comm1, con
552557
* No error checking is done*/
553558
static inline ompi_communicator_t *ompi_comm_lookup (const uint32_t c_index)
554559
{
560+
ompi_communicator_t *comm = NULL;
555561
/* array of pointers to communicators, indexed by context ID */
556-
return (ompi_communicator_t *) opal_pointer_array_get_item (&ompi_mpi_communicators, c_index);
562+
comm = (ompi_communicator_t *) opal_pointer_array_get_item (&ompi_mpi_communicators, c_index);
563+
/*
564+
* OMPI_COMM_SENTINEL indicates the slot is being used for CID allocation
565+
* and is not a valid communicator
566+
*/
567+
if ((ompi_communicator_t *)OMPI_COMM_SENTINEL == comm) {
568+
comm = NULL;
569+
}
570+
571+
return comm;
572+
}
573+
574+
/**
575+
* Number of entries in the ompi_mpi_communicators pointer array.
576+
* Note this includes entries which may have NULL or OMPI_COMM_SENTINEL values.
577+
*/
578+
static inline int ompi_comm_get_num_communicators(void)
579+
{
580+
return opal_pointer_array_get_size(&ompi_mpi_communicators);
557581
}
558582

559583
static inline ompi_communicator_t *ompi_comm_lookup_cid (const ompi_comm_extended_cid_t cid)

ompi/dpm/dpm.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1769,9 +1769,9 @@ int ompi_dpm_dyn_finalize(void)
17691769
return OMPI_ERR_OUT_OF_RESOURCE;
17701770
}
17711771

1772-
max = opal_pointer_array_get_size(&ompi_mpi_communicators);
1772+
max = ompi_comm_get_num_communicators();
17731773
for (i=3; i<max; i++) {
1774-
comm = (ompi_communicator_t*)opal_pointer_array_get_item(&ompi_mpi_communicators,i);
1774+
comm = ompi_comm_lookup(i);
17751775
if (NULL != comm && OMPI_COMM_IS_DYNAMIC(comm)) {
17761776
objs[j++] = disconnect_init(comm);
17771777
}

ompi/errhandler/errhandler.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -355,9 +355,9 @@ int ompi_errhandler_proc_failed_internal(ompi_proc_t* ompi_proc, int status, boo
355355

356356
/* Communicator State:
357357
* Let them know about the failure. */
358-
max_num_comm = opal_pointer_array_get_size(&ompi_mpi_communicators);
358+
max_num_comm = ompi_comm_get_num_communicators();
359359
for( i = 0; i < max_num_comm; ++i ) {
360-
comm = (ompi_communicator_t *)opal_pointer_array_get_item(&ompi_mpi_communicators, i);
360+
comm = ompi_comm_lookup(i);
361361
if( NULL == comm ) {
362362
continue;
363363
}

0 commit comments

Comments
 (0)