Skip to content

Commit 673ec16

Browse files
authored
Merge pull request #13024 from gkatev/xhc_fix_sessions_mca_sigsegv
coll/xhc: Fix MCA var related segfault with sessions
2 parents e803e93 + c1e5d81 commit 673ec16

File tree

1 file changed

+27
-5
lines changed

1 file changed

+27
-5
lines changed

ompi/mca/coll/xhc/coll_xhc_component.c

Lines changed: 27 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,30 +110,43 @@ mca_coll_xhc_component_t mca_coll_xhc_component = {
110110
.uniform_chunks = true,
111111
.uniform_chunks_min = 4096,
112112

113-
.op_mca[XHC_BCAST] = {
113+
.op_mca = {{0}},
114+
.op_mca_global = {0}
115+
};
116+
117+
/* Rather than having the defaults directly inside the component, we keep
118+
* them in a separate structure and copy them over (in xhc_register()). The
119+
* structs in the component are used as storage for the MCA variables, and
120+
* the MCA system will nullify the storage of string variables when it is
121+
* torn down during Finalize. This is a problem if we have multiple MPI
122+
* Sessions, as we'll have lost our defaults the next time we attempt to
123+
* initialize our MCA variables at the second Init. */
124+
static xhc_op_mca_t op_mca_default[XHC_COLLCOUNT] = {
125+
[XHC_BCAST] = {
114126
.hierarchy = "numa,socket",
115127
.chunk_size = "16K",
116128
.cico_max = 256
117129
},
118130

119-
.op_mca[XHC_BARRIER] = {
131+
[XHC_BARRIER] = {
120132
.hierarchy = "numa,socket",
121133
.chunk_size = "1",
122134
.cico_max = 0
123135
},
124136

125-
.op_mca[XHC_REDUCE] = {
137+
[XHC_REDUCE] = {
126138
.hierarchy = "l3,numa,socket",
127139
.chunk_size = "16K",
128140
.cico_max = 4096
129141
},
130142

131-
.op_mca[XHC_ALLREDUCE] = {
143+
[XHC_ALLREDUCE] = {
132144
.hierarchy = "l3,numa,socket",
133145
.chunk_size = "16K",
134146
.cico_max = 4096
135147
}
136148
};
149+
static xhc_op_mca_t op_mca_global_default = {0};
137150

138151
// -----------------------------
139152

@@ -322,7 +335,7 @@ static int xhc_register(void) {
322335
OBJ_RELEASE(var_enum_flag);
323336

324337
/* (All)reduce uniform chunks */
325-
// ---------------------------
338+
// -----------------------------
326339

327340
mca_base_component_var_register(&mca_coll_xhc_component.super.collm_version,
328341
"uniform_chunks", "Automatically optimize chunk size in reduction "
@@ -336,6 +349,15 @@ static int xhc_register(void) {
336349
NULL, 0, 0, OPAL_INFO_LVL_5, MCA_BASE_VAR_SCOPE_READONLY,
337350
&mca_coll_xhc_component.uniform_chunks_min);
338351

352+
/* Apply the op mca defaults. This must be done here rather than in-line in
353+
* the registration loops below, as some iterations are skipped, for the
354+
* variables that are not applicable (e.g. chunk size in Barrier). */
355+
356+
for(int t = 0; t < XHC_COLLCOUNT; t++) {
357+
mca_coll_xhc_component.op_mca[t] = op_mca_default[t];
358+
}
359+
mca_coll_xhc_component.op_mca_global = op_mca_global_default;
360+
339361
/* Hierarchy */
340362
// ------------
341363

0 commit comments

Comments
 (0)