Skip to content

Commit a081fba

Browse files
author
Sergey Oblomov
committed
OSC/UCX: fixed hang on OSC init
- the worker progress call was missed on startup, which caused a hang on one of the ranks. Signed-off-by: Sergey Oblomov <[email protected]>
1 parent 4447738 commit a081fba

File tree

3 files changed

+12
-10
lines changed

3 files changed

+12
-10
lines changed

ompi/mca/osc/ucx/osc_ucx.h

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@ typedef struct ompi_osc_ucx_component {
3838
opal_free_list_t requests; /* request free list for the r* communication variants */
3939
bool env_initialized; /* UCX environment is initialized or not */
4040
int num_incomplete_req_ops;
41+
int init_in_progress;
4142
unsigned int priority;
4243
} ompi_osc_ucx_component_t;
4344

ompi/mca/osc/ucx/osc_ucx_component.c

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,12 @@ ompi_osc_ucx_component_t mca_osc_ucx_component = {
4545
.osc_query = component_query,
4646
.osc_select = component_select,
4747
.osc_finalize = component_finalize,
48-
}
48+
},
49+
.ucp_context = NULL,
50+
.ucp_worker = NULL,
51+
.env_initialized = false,
52+
.num_incomplete_req_ops = 0,
53+
.init_in_progress = 1
4954
};
5055

5156
ompi_osc_ucx_module_t ompi_osc_ucx_module_template = {
@@ -105,24 +110,19 @@ static int component_register(void) {
105110
}
106111

107112
static int progress_callback(void) {
108-
if (mca_osc_ucx_component.ucp_worker != NULL &&
109-
mca_osc_ucx_component.num_incomplete_req_ops > 0) {
113+
if ((mca_osc_ucx_component.ucp_worker != NULL) &&
114+
(mca_osc_ucx_component.num_incomplete_req_ops +
115+
mca_osc_ucx_component.init_in_progress > 0)) {
110116
ucp_worker_progress(mca_osc_ucx_component.ucp_worker);
111117
}
112118
return 0;
113119
}
114120

115121
static int component_init(bool enable_progress_threads, bool enable_mpi_threads) {
116-
int ret = OMPI_SUCCESS;
117-
118-
mca_osc_ucx_component.ucp_context = NULL;
119-
mca_osc_ucx_component.ucp_worker = NULL;
120122
mca_osc_ucx_component.enable_mpi_threads = enable_mpi_threads;
121-
mca_osc_ucx_component.env_initialized = false;
122-
mca_osc_ucx_component.num_incomplete_req_ops = 0;
123123

124124
opal_common_ucx_mca_register();
125-
return ret;
125+
return OMPI_SUCCESS;
126126
}
127127

128128
static int component_finalize(void) {

ompi/mca/osc/ucx/osc_ucx_request.c

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -57,6 +57,7 @@ void req_completion(void *request, ucs_status_t status) {
5757
ompi_request_complete(&(req->external_req->super), true);
5858
ucp_request_release(req);
5959
mca_osc_ucx_component.num_incomplete_req_ops--;
60+
mca_osc_ucx_component.init_in_progress = 0;
6061
assert(mca_osc_ucx_component.num_incomplete_req_ops >= 0);
6162
}
6263
}

0 commit comments

Comments
 (0)