Skip to content

Commit 09dd383

Browse files
authored
Merge pull request #7108 from devreal/btl-ugni-deadlock
uGNI: Fix potential deadlock when processing outstanding transfers
2 parents: 40e2fbb + c09ca03; commit: 09dd383

File tree

1 file changed

+21
-9
lines changed

1 file changed

+21
-9
lines changed

opal/mca/btl/ugni/btl_ugni_component.c

Lines changed: 21 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -666,34 +666,46 @@ static inline int
666666
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t *ugni_module)
667667
{
668668
int rc = OPAL_SUCCESS;
669+
opal_list_t tmplist;
670+
opal_list_t *waitlist = &ugni_module->ep_wait_list;
669671
mca_btl_base_endpoint_t *endpoint = NULL;
670672
int count;
671673

672-
if (0 == opal_list_get_size(&ugni_module->ep_wait_list)) {
673-
return 0;
674-
}
675-
676674
/* check the count before taking the lock to avoid unnecessary locking */
677-
count = opal_list_get_size(&ugni_module->ep_wait_list);
675+
count = opal_list_get_size(waitlist);
678676
if (0 == count) {
679677
return 0;
680678
}
681679

680+
/* Don't hold the wait-list lock while processing the list as that may lead
681+
* to a deadlock.
682+
* Instead, move the wait_list elements into a temporary list and work on that.*/
683+
OBJ_CONSTRUCT(&tmplist, opal_list_t);
682684
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
683-
count = opal_list_get_size(&ugni_module->ep_wait_list);
685+
opal_list_join(&tmplist, opal_list_get_end(&tmplist), waitlist);
686+
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
687+
count = opal_list_get_size(&tmplist);
684688
do {
685-
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&ugni_module->ep_wait_list);
689+
endpoint = (mca_btl_base_endpoint_t *) opal_list_remove_first (&tmplist);
686690
if (endpoint != NULL) {
687691
rc = mca_btl_ugni_progress_send_wait_list (endpoint);
688692

689693
if (OPAL_SUCCESS != rc) {
690-
opal_list_append (&ugni_module->ep_wait_list, &endpoint->super);
694+
opal_list_append (&tmplist, &endpoint->super);
691695
} else {
692696
endpoint->wait_listed = false;
693697
}
694698
}
695699
} while (endpoint != NULL && --count > 0) ;
696-
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
700+
701+
/* reinsert unfinished elements into the wait-list */
702+
count = opal_list_get_size(&tmplist);
703+
if (0 < count) {
704+
OPAL_THREAD_LOCK(&ugni_module->ep_wait_list_lock);
705+
opal_list_join(waitlist, opal_list_get_end(waitlist), &tmplist);
706+
OPAL_THREAD_UNLOCK(&ugni_module->ep_wait_list_lock);
707+
}
708+
OBJ_DESTRUCT(&tmplist);
697709

698710
return rc;
699711
}

0 commit comments

Comments
 (0)