Skip to content

MTL OFI: Fix race condition due to global progress entries array #5622

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 31, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions ompi/mca/mtl/ofi/mtl_ofi.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,23 +59,23 @@ ompi_mtl_ofi_progress(void)
int count = 0, i, events_read;
struct fi_cq_err_entry error = { 0 };
ompi_mtl_ofi_request_t *ofi_req = NULL;
struct fi_cq_tagged_entry wc[ompi_mtl_ofi.ofi_progress_event_count];

/**
* Read the work completions from the CQ.
* From the completion's op_context, we get the associated OFI request.
* Call the request's callback.
*/
while (true) {
ret = fi_cq_read(ompi_mtl_ofi.cq, ompi_mtl_ofi.progress_entries,
ompi_mtl_ofi.ofi_progress_event_count);
ret = fi_cq_read(ompi_mtl_ofi.cq, (void *)&wc, ompi_mtl_ofi.ofi_progress_event_count);
if (ret > 0) {
count+= ret;
events_read = ret;
for (i = 0; i < events_read; i++) {
if (NULL != ompi_mtl_ofi.progress_entries[i].op_context) {
ofi_req = TO_OFI_REQ(ompi_mtl_ofi.progress_entries[i].op_context);
if (NULL != wc[i].op_context) {
ofi_req = TO_OFI_REQ(wc[i].op_context);
assert(ofi_req);
ret = ofi_req->event_callback(&ompi_mtl_ofi.progress_entries[i], ofi_req);
ret = ofi_req->event_callback(&wc[i], ofi_req);
if (OMPI_SUCCESS != ret) {
opal_output(0, "%s:%d: Error returned by request event callback: %zd.\n"
"*** The Open MPI OFI MTL is aborting the MPI job (via exit(3)).\n",
Expand Down
20 changes: 0 additions & 20 deletions ompi/mca/mtl/ofi/mtl_ofi_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -663,21 +663,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
goto error;
}

/**
* Allocate memory for storing the CQ events read in OFI progress.
*/
ompi_mtl_ofi.progress_entries = calloc(ompi_mtl_ofi.ofi_progress_event_count, sizeof(struct fi_cq_tagged_entry));
if (NULL == ompi_mtl_ofi.progress_entries) {
opal_output_verbose(1, ompi_mtl_base_framework.framework_output,
"%s:%d: alloc of CQ event storage failed: %s\n",
__FILE__, __LINE__, strerror(errno));
goto error;
}

/**
* The remote fi_addr will be stored in the ofi_endpoint struct.
*/

av_attr.type = (MTL_OFI_AV_TABLE == av_type) ? FI_AV_TABLE: FI_AV_MAP;

ret = fi_av_open(ompi_mtl_ofi.domain, &av_attr, &ompi_mtl_ofi.av, NULL);
Expand Down Expand Up @@ -799,9 +784,6 @@ ompi_mtl_ofi_component_init(bool enable_progress_threads,
if (ompi_mtl_ofi.fabric) {
(void) fi_close((fid_t)ompi_mtl_ofi.fabric);
}
if (ompi_mtl_ofi.progress_entries) {
free(ompi_mtl_ofi.progress_entries);
}

return NULL;
}
Expand Down Expand Up @@ -834,8 +816,6 @@ ompi_mtl_ofi_finalize(struct mca_mtl_base_module_t *mtl)
goto finalize_err;
}

free(ompi_mtl_ofi.progress_entries);

return OMPI_SUCCESS;

finalize_err:
Expand Down
3 changes: 0 additions & 3 deletions ompi/mca/mtl/ofi/mtl_ofi_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,6 @@ typedef struct mca_mtl_ofi_module_t {
/** Maximum number of CQ events to read in OFI Progress */
int ofi_progress_event_count;

/** CQ event storage */
struct fi_cq_tagged_entry *progress_entries;

/** Use FI_REMOTE_CQ_DATA*/
bool fi_cq_data;

Expand Down