@@ -666,34 +666,46 @@ static inline int
666
666
mca_btl_ugni_progress_wait_list (mca_btl_ugni_module_t * ugni_module )
667
667
{
668
668
int rc = OPAL_SUCCESS ;
669
+ opal_list_t tmplist ;
670
+ opal_list_t * waitlist = & ugni_module -> ep_wait_list ;
669
671
mca_btl_base_endpoint_t * endpoint = NULL ;
670
672
int count ;
671
673
672
- if (0 == opal_list_get_size (& ugni_module -> ep_wait_list )) {
673
- return 0 ;
674
- }
675
-
676
674
/* check the count before taking the lock to avoid unnecessary locking */
677
- count = opal_list_get_size (& ugni_module -> ep_wait_list );
675
+ count = opal_list_get_size (waitlist );
678
676
if (0 == count ) {
679
677
return 0 ;
680
678
}
681
679
680
+ /* Don't hold the wait-list lock while processing the list as that may lead
681
+ * to a deadlock.
682
+ * Instead, move the wait_list elements into a temporary list and work on that.*/
683
+ OBJ_CONSTRUCT (& tmplist , opal_list_t );
682
684
OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
683
- count = opal_list_get_size (& ugni_module -> ep_wait_list );
685
+ opal_list_join (& tmplist , opal_list_get_end (& tmplist ), waitlist );
686
+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
687
+ count = opal_list_get_size (& tmplist );
684
688
do {
685
- endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& ugni_module -> ep_wait_list );
689
+ endpoint = (mca_btl_base_endpoint_t * ) opal_list_remove_first (& tmplist );
686
690
if (endpoint != NULL ) {
687
691
rc = mca_btl_ugni_progress_send_wait_list (endpoint );
688
692
689
693
if (OPAL_SUCCESS != rc ) {
690
- opal_list_append (& ugni_module -> ep_wait_list , & endpoint -> super );
694
+ opal_list_append (& tmplist , & endpoint -> super );
691
695
} else {
692
696
endpoint -> wait_listed = false;
693
697
}
694
698
}
695
699
} while (endpoint != NULL && -- count > 0 ) ;
696
- OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
700
+
701
+ /* reinsert unfinished elements into the wait-list */
702
+ count = opal_list_get_size (& tmplist );
703
+ if (0 < count ) {
704
+ OPAL_THREAD_LOCK (& ugni_module -> ep_wait_list_lock );
705
+ opal_list_join (waitlist , opal_list_get_end (waitlist ), & tmplist );
706
+ OPAL_THREAD_UNLOCK (& ugni_module -> ep_wait_list_lock );
707
+ }
708
+ OBJ_DESTRUCT (& tmplist );
697
709
698
710
return rc ;
699
711
}
0 commit comments