Skip to content

Commit 8d1be27

Browse files
author
Ralph Castain
committed
Deal with special case during cleanup
In some scenarios, we can have a daemon sharing the node with mpirun. In those cases, we need to avoid race conditions in cleanup. Signed-off-by: Ralph Castain <[email protected]>
1 parent a0ea197 commit 8d1be27

File tree

1 file changed

+39
-15
lines changed

1 file changed

+39
-15
lines changed

orte/util/session_dir.c

Lines changed: 39 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* Copyright (c) 2014 Cisco Systems, Inc. All rights reserved.
1313
* Copyright (c) 2015 Research Organization for Information Science
1414
* and Technology (RIST). All rights reserved.
15-
* Copyright (c) 2015-2017 Intel, Inc. All rights reserved.
15+
* Copyright (c) 2015-2018 Intel, Inc. All rights reserved.
1616
* $COPYRIGHT$
1717
*
1818
* Additional copyrights may follow
@@ -61,6 +61,7 @@
6161
#include "orte/util/show_help.h"
6262

6363
#include "orte/mca/errmgr/errmgr.h"
64+
#include "orte/mca/ras/base/base.h"
6465
#include "orte/runtime/runtime.h"
6566
#include "orte/runtime/orte_globals.h"
6667

@@ -370,6 +371,16 @@ int orte_session_dir(bool create, orte_process_name_t *proc)
370371
int
371372
orte_session_dir_cleanup(orte_jobid_t jobid)
372373
{
374+
/* special case - if a daemon is colocated with mpirun,
375+
* then we let mpirun do the rest to avoid a race
376+
* condition. this scenario always results in the rank=1
377+
* daemon colocated with mpirun */
378+
if (orte_ras_base.launch_orted_on_hn &&
379+
ORTE_PROC_IS_DAEMON &&
380+
1 == ORTE_PROC_MY_NAME->vpid) {
381+
return ORTE_SUCCESS;
382+
}
383+
373384
if (!orte_create_session_dirs || orte_process_info.rm_session_dirs ) {
374385
/* we haven't created them or RM will clean them up for us*/
375386
return ORTE_SUCCESS;
@@ -386,6 +397,7 @@ orte_session_dir_cleanup(orte_jobid_t jobid)
386397
return ORTE_ERR_NOT_INITIALIZED;
387398
}
388399

400+
389401
/* recursively blow the whole session away for our job family,
390402
* saving only output files
391403
*/
@@ -461,20 +473,6 @@ orte_session_dir_finalize(orte_process_name_t *proc)
461473

462474
opal_os_dirpath_destroy(orte_process_info.proc_session_dir,
463475
false, orte_dir_check_file);
464-
opal_os_dirpath_destroy(orte_process_info.job_session_dir,
465-
false, orte_dir_check_file);
466-
/* only remove the jobfam session dir if we are the
467-
* local daemon and we are finalizing our own session dir */
468-
if ((ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) &&
469-
(ORTE_PROC_MY_NAME == proc)) {
470-
opal_os_dirpath_destroy(orte_process_info.jobfam_session_dir,
471-
false, orte_dir_check_file);
472-
}
473-
474-
if( NULL != orte_process_info.top_session_dir ){
475-
opal_os_dirpath_destroy(orte_process_info.top_session_dir,
476-
false, orte_dir_check_file);
477-
}
478476

479477
if (opal_os_dirpath_is_empty(orte_process_info.proc_session_dir)) {
480478
if (orte_debug_flag) {
@@ -492,6 +490,32 @@ orte_session_dir_finalize(orte_process_name_t *proc)
492490
}
493491
}
494492

493+
/* special case - if a daemon is colocated with mpirun,
494+
* then we let mpirun do the rest to avoid a race
495+
* condition. this scenario always results in the rank=1
496+
* daemon colocated with mpirun */
497+
if (orte_ras_base.launch_orted_on_hn &&
498+
ORTE_PROC_IS_DAEMON &&
499+
1 == ORTE_PROC_MY_NAME->vpid) {
500+
return ORTE_SUCCESS;
501+
}
502+
503+
opal_os_dirpath_destroy(orte_process_info.job_session_dir,
504+
false, orte_dir_check_file);
505+
506+
/* only remove the jobfam session dir if we are the
507+
* local daemon and we are finalizing our own session dir */
508+
if ((ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) &&
509+
(ORTE_PROC_MY_NAME == proc)) {
510+
opal_os_dirpath_destroy(orte_process_info.jobfam_session_dir,
511+
false, orte_dir_check_file);
512+
}
513+
514+
if( NULL != orte_process_info.top_session_dir ){
515+
opal_os_dirpath_destroy(orte_process_info.top_session_dir,
516+
false, orte_dir_check_file);
517+
}
518+
495519
if (opal_os_dirpath_is_empty(orte_process_info.job_session_dir)) {
496520
if (orte_debug_flag) {
497521
opal_output(0, "sess_dir_finalize: found job session dir empty - deleting");

0 commit comments

Comments
 (0)