Skip to content

Commit 81195ab

Browse files
committed
Several fixes related to session directories:
* enable OMPI to retrieve paths from RM through PMIx
* cleanups related to tempdirs.
1 parent fb51d65 commit 81195ab

File tree

16 files changed

+337
-543
lines changed

16 files changed

+337
-543
lines changed

orte/mca/ess/base/ess_base_std_app.c

Lines changed: 1 addition & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,7 @@ int orte_ess_base_app_setup(bool db_restrict_local)
136136
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
137137
(NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
138138
orte_process_info.nodename));
139-
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
140-
orte_process_info.tmpdir_base,
141-
orte_process_info.nodename,
142-
ORTE_PROC_MY_NAME))) {
139+
if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
143140
ORTE_ERROR_LOG(ret);
144141
error = "orte_session_dir";
145142
goto error;
@@ -149,29 +146,6 @@ int orte_ess_base_app_setup(bool db_restrict_local)
149146
proc-specific session directory. */
150147
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
151148
"output-", NULL, NULL);
152-
/* store the session directory location */
153-
OBJ_CONSTRUCT(&kv, opal_value_t);
154-
kv.key = strdup(OPAL_PMIX_NSDIR);
155-
kv.type = OPAL_STRING;
156-
kv.data.string = strdup(orte_process_info.job_session_dir);
157-
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &kv))) {
158-
ORTE_ERROR_LOG(ret);
159-
OBJ_DESTRUCT(&kv);
160-
error = "opal pmix put job sessiondir";
161-
goto error;
162-
}
163-
OBJ_DESTRUCT(&kv);
164-
OBJ_CONSTRUCT(&kv, opal_value_t);
165-
kv.key = strdup(OPAL_PMIX_PROCDIR);
166-
kv.type = OPAL_STRING;
167-
kv.data.string = strdup(orte_process_info.proc_session_dir);
168-
if (OPAL_SUCCESS != (ret = opal_pmix.store_local(ORTE_PROC_MY_NAME, &kv))) {
169-
ORTE_ERROR_LOG(ret);
170-
OBJ_DESTRUCT(&kv);
171-
error = "opal pmix put proc sessiondir";
172-
goto error;
173-
}
174-
OBJ_DESTRUCT(&kv);
175149
}
176150
/* Setup the communication infrastructure */
177151
/*

orte/mca/ess/base/ess_base_std_orted.c

Lines changed: 4 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -237,10 +237,7 @@ int orte_ess_base_orted_setup(char **hosts)
237237
/* take a pass thru the session directory code to fill in the
238238
* tmpdir names - don't create anything yet
239239
*/
240-
if (ORTE_SUCCESS != (ret = orte_session_dir(false,
241-
orte_process_info.tmpdir_base,
242-
orte_process_info.nodename,
243-
ORTE_PROC_MY_NAME))) {
240+
if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
244241
ORTE_ERROR_LOG(ret);
245242
error = "orte_session_dir define";
246243
goto error;
@@ -250,10 +247,7 @@ int orte_ess_base_orted_setup(char **hosts)
250247
*/
251248
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
252249
/* now actually create the directory tree */
253-
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
254-
orte_process_info.tmpdir_base,
255-
orte_process_info.nodename,
256-
ORTE_PROC_MY_NAME))) {
250+
if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
257251
ORTE_ERROR_LOG(ret);
258252
error = "orte_session_dir";
259253
goto error;
@@ -277,11 +271,8 @@ int orte_ess_base_orted_setup(char **hosts)
277271
/* define a log file name in the session directory */
278272
snprintf(log_file, PATH_MAX, "output-orted-%s-%s.log",
279273
jobidstring, orte_process_info.nodename);
280-
log_path = opal_os_path(false,
281-
orte_process_info.tmpdir_base,
282-
orte_process_info.top_session_dir,
283-
log_file,
284-
NULL);
274+
log_path = opal_os_path(false, orte_process_info.top_session_dir,
275+
log_file, NULL);
285276

286277
fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640);
287278
if (fd < 0) {

orte/mca/ess/base/ess_base_std_tool.c

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -145,10 +145,9 @@ int orte_ess_base_tool_setup(void)
145145
* tmp base where any other session directories on
146146
* this node might be located
147147
*/
148-
if (ORTE_SUCCESS != (ret = orte_session_dir_get_name(NULL,
149-
&orte_process_info.tmpdir_base,
150-
&orte_process_info.top_session_dir,
151-
orte_process_info.nodename, NULL))) {
148+
149+
ret = orte_session_setup_base(NULL);
150+
if (ORTE_SUCCESS != ret ) {
152151
ORTE_ERROR_LOG(ret);
153152
error = "define session dir names";
154153
goto error;

orte/mca/ess/hnp/ess_hnp_module.c

Lines changed: 15 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ static int rte_init(void)
138138
{
139139
int ret;
140140
char *error = NULL;
141-
char *contact_path, *jobfam_dir;
141+
char *contact_path;
142142
orte_job_t *jdata;
143143
orte_node_t *node;
144144
orte_proc_t *proc;
@@ -294,10 +294,7 @@ static int rte_init(void)
294294
/* take a pass thru the session directory code to fill in the
295295
* tmpdir names - don't create anything yet
296296
*/
297-
if (ORTE_SUCCESS != (ret = orte_session_dir(false,
298-
orte_process_info.tmpdir_base,
299-
orte_process_info.nodename,
300-
ORTE_PROC_MY_NAME))) {
297+
if (ORTE_SUCCESS != (ret = orte_session_dir(false, ORTE_PROC_MY_NAME))) {
301298
error = "orte_session_dir define";
302299
goto error;
303300
}
@@ -307,10 +304,7 @@ static int rte_init(void)
307304
orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);
308305

309306
/* now actually create the directory tree */
310-
if (ORTE_SUCCESS != (ret = orte_session_dir(true,
311-
orte_process_info.tmpdir_base,
312-
orte_process_info.nodename,
313-
ORTE_PROC_MY_NAME))) {
307+
if (ORTE_SUCCESS != (ret = orte_session_dir(true, ORTE_PROC_MY_NAME))) {
314308
error = "orte_session_dir";
315309
goto error;
316310
}
@@ -586,9 +580,12 @@ static int rte_init(void)
586580
opal_output_set_output_file_info(orte_process_info.proc_session_dir,
587581
"output-", NULL, NULL);
588582
/* save my contact info in a file for others to find */
589-
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
590-
contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
591-
free(jobfam_dir);
583+
if( NULL == orte_process_info.jobfam_session_dir ){
584+
/* has to be set here! */
585+
ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
586+
goto error;
587+
}
588+
contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir, "contact.txt", NULL);
592589
OPAL_OUTPUT_VERBOSE((2, orte_debug_output,
593590
"%s writing contact file %s",
594591
ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
@@ -758,10 +755,9 @@ static int rte_init(void)
758755
true, error, ORTE_ERROR_NAME(ret), ret);
759756
}
760757
/* remove my contact info file, if we have session directories */
761-
if (NULL != orte_process_info.job_session_dir) {
762-
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
763-
contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
764-
free(jobfam_dir);
758+
if (NULL != orte_process_info.jobfam_session_dir) {
759+
contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir,
760+
"contact.txt", NULL);
765761
unlink(contact_path);
766762
free(contact_path);
767763
}
@@ -775,7 +771,6 @@ static int rte_init(void)
775771
static int rte_finalize(void)
776772
{
777773
char *contact_path;
778-
char *jobfam_dir;
779774

780775
if (signals_set) {
781776
/* Remove the epipe handler */
@@ -816,10 +811,9 @@ static int rte_finalize(void)
816811
(void) mca_base_framework_close(&opal_pstat_base_framework);
817812

818813
/* remove my contact info file, if we have session directories */
819-
if (NULL != orte_process_info.job_session_dir) {
820-
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
821-
contact_path = opal_os_path(false, jobfam_dir, "contact.txt", NULL);
822-
free(jobfam_dir);
814+
if (NULL != orte_process_info.jobfam_session_dir) {
815+
contact_path = opal_os_path(false, orte_process_info.jobfam_session_dir,
816+
"contact.txt", NULL);
823817
unlink(contact_path);
824818
free(contact_path);
825819
}

orte/mca/ess/pmi/ess_pmi_module.c

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,43 @@ static int rte_init(void)
242242
free(string_key);
243243
}
244244

245+
/* retrieve temp directories info */
246+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_TMPDIR, &wildcard_rank, &val, OPAL_STRING);
247+
if (OPAL_SUCCESS == ret && NULL != val) {
248+
/* TODO: who has precedence - pmix or MCA setting??? */
249+
if( NULL == orte_process_info.top_session_dir ){
250+
orte_process_info.top_session_dir = val;
251+
} else {
252+
/* keep the MCA setting */
253+
free(val);
254+
}
255+
val = NULL;
256+
}
257+
258+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_NSDIR, &wildcard_rank, &val, OPAL_STRING);
259+
if (OPAL_SUCCESS == ret && NULL != val) {
260+
/* TODO: who has precedence - pmix or MCA setting??? */
261+
if( NULL == orte_process_info.job_session_dir ){
262+
orte_process_info.job_session_dir = val;
263+
} else {
264+
/* keep the MCA setting */
265+
free(val);
266+
}
267+
val = NULL;
268+
}
269+
270+
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_PROCDIR, &wildcard_rank, &val, OPAL_STRING);
271+
if (OPAL_SUCCESS == ret && NULL != val) {
272+
/* TODO: who has precedence - pmix or MCA setting??? */
273+
if( NULL == orte_process_info.proc_session_dir ){
274+
orte_process_info.proc_session_dir = val;
275+
} else {
276+
/* keep the MCA setting */
277+
free(val);
278+
}
279+
val = NULL;
280+
}
281+
245282
/* retrieve our topology */
246283
val = NULL;
247284
OPAL_MODEX_RECV_VALUE_OPTIONAL(ret, OPAL_PMIX_LOCAL_TOPO,

orte/mca/filem/raw/filem_raw_module.c

Lines changed: 29 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,17 @@ static void recv_ack(int status, orte_process_name_t* sender,
105105
void* cbdata);
106106
static void write_handler(int fd, short event, void *cbdata);
107107

108+
static char *filem_session_dir()
109+
{
110+
char *session_dir = orte_process_info.jobfam_session_dir;
111+
if( NULL == session_dir ){
112+
/* if no job family session dir was provided -
113+
* use the job session dir */
114+
session_dir = orte_process_info.job_session_dir;
115+
}
116+
return session_dir;
117+
}
118+
108119
static int raw_init(void)
109120
{
110121
OBJ_CONSTRUCT(&incoming_files, opal_list_t);
@@ -657,25 +668,26 @@ static int create_link(char *my_dir, char *path,
657668
static int raw_link_local_files(orte_job_t *jdata,
658669
orte_app_context_t *app)
659670
{
660-
char *my_dir, *path=NULL;
671+
char *session_dir, *path=NULL;
661672
orte_proc_t *proc;
662-
char *prefix;
663673
int i, j, rc;
664674
orte_filem_raw_incoming_t *inbnd;
665675
opal_list_item_t *item;
666676
char **files=NULL, *bname, *filestring;
667677

668-
/* check my session directory for files I have received and
678+
/* check my jobfam session directory for files I have received and
669679
* symlink them to the proc-level session directory of each
670680
* local process in the job
681+
*
682+
* TODO: @rhc - please check that I've correctly interpreted your
683+
* intention here
671684
*/
672-
my_dir = opal_dirname(orte_process_info.job_session_dir);
673-
674-
/* setup */
675-
if (NULL != orte_process_info.tmpdir_base) {
676-
prefix = strdup(orte_process_info.tmpdir_base);
677-
} else {
678-
prefix = NULL;
685+
session_dir = filem_session_dir();
686+
if( NULL == session_dir){
687+
/* we were unable to find any suitable directory */
688+
rc = ORTE_ERR_BAD_PARAM;
689+
ORTE_ERROR_LOG(rc);
690+
return rc;
679691
}
680692

681693
/* get the list of files this app wants */
@@ -692,10 +704,6 @@ static int raw_link_local_files(orte_job_t *jdata,
692704

693705
/* if there are no files to link, then ignore this */
694706
if (NULL == files) {
695-
free(my_dir);
696-
if (NULL != prefix) {
697-
free(prefix);
698-
}
699707
return ORTE_SUCCESS;
700708
}
701709

@@ -736,22 +744,15 @@ static int raw_link_local_files(orte_job_t *jdata,
736744
ORTE_NAME_PRINT(&proc->name)));
737745

738746
/* get the session dir name in absolute form */
739-
path = NULL;
740-
rc = orte_session_dir_get_name(&path, &prefix, NULL,
741-
orte_process_info.nodename,
742-
&proc->name);
747+
path = orte_process_info.proc_session_dir;
748+
743749
/* create it, if it doesn't already exist */
744750
if (OPAL_SUCCESS != (rc = opal_os_dirpath_create(path, S_IRWXU))) {
745751
ORTE_ERROR_LOG(rc);
746752
/* doesn't exist with correct permissions, and/or we can't
747753
* create it - either way, we are done
748754
*/
749755
free(files);
750-
if (NULL != prefix) {
751-
free(prefix);
752-
}
753-
free(path);
754-
free(my_dir);
755756
return rc;
756757
}
757758

@@ -775,13 +776,8 @@ static int raw_link_local_files(orte_job_t *jdata,
775776
inbnd->file));
776777
/* cycle thru the link points and create symlinks to them */
777778
for (j=0; NULL != inbnd->link_pts[j]; j++) {
778-
if (ORTE_SUCCESS != (rc = create_link(my_dir, path, inbnd->link_pts[j]))) {
779+
if (ORTE_SUCCESS != (rc = create_link(session_dir, path, inbnd->link_pts[j]))) {
779780
ORTE_ERROR_LOG(rc);
780-
free(my_dir);
781-
free(path);
782-
if (NULL != prefix) {
783-
free(prefix);
784-
}
785781
free(files);
786782
return rc;
787783
}
@@ -796,13 +792,8 @@ static int raw_link_local_files(orte_job_t *jdata,
796792
}
797793
}
798794
}
799-
free(path);
800795
}
801796
opal_argv_free(files);
802-
if (NULL != prefix) {
803-
free(prefix);
804-
}
805-
free(my_dir);
806797
return ORTE_SUCCESS;
807798
}
808799

@@ -999,7 +990,7 @@ static void recv_files(int status, orte_process_name_t* sender,
999990
opal_buffer_t* buffer, orte_rml_tag_t tag,
1000991
void* cbdata)
1001992
{
1002-
char *file, *jobfam_dir;
993+
char *file, *session_dir;
1003994
int32_t nchunk, n, nbytes;
1004995
unsigned char data[ORTE_FILEM_RAW_CHUNK_MAX];
1005996
int rc;
@@ -1086,9 +1077,9 @@ static void recv_files(int status, orte_process_name_t* sender,
10861077
incoming->top = strdup(tmp);
10871078
free(tmp);
10881079
/* define the full path to where we will put it */
1089-
jobfam_dir = opal_dirname(orte_process_info.job_session_dir);
1090-
incoming->fullpath = opal_os_path(false, jobfam_dir, file, NULL);
1091-
free(jobfam_dir);
1080+
session_dir = filem_session_dir();
1081+
1082+
incoming->fullpath = opal_os_path(false, session_dir, file, NULL);
10921083

10931084
OPAL_OUTPUT_VERBOSE((1, orte_filem_base_framework.framework_output,
10941085
"%s filem:raw: opening target file %s",

orte/mca/oob/usock/oob_usock_component.c

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,7 @@ static int component_available(void)
164164

165165
/* if session directories were forbidden, then we cannot be used */
166166
if (!orte_create_session_dirs ||
167-
NULL == orte_process_info.tmpdir_base ||
168-
NULL == orte_process_info.top_session_dir) {
167+
NULL == orte_process_info.jobfam_session_dir ) {
169168
return ORTE_ERR_NOT_SUPPORTED;
170169
}
171170

@@ -216,9 +215,7 @@ static int component_startup(void)
216215
/* setup the path to the daemon rendezvous point */
217216
memset(&mca_oob_usock_component.address, 0, sizeof(struct sockaddr_un));
218217
mca_oob_usock_component.address.sun_family = AF_UNIX;
219-
session = opal_os_path(false, orte_process_info.tmpdir_base,
220-
orte_process_info.top_session_dir,
221-
orte_process_info.jobfam_session_dir,
218+
session = opal_os_path(false, orte_process_info.jobfam_session_dir,
222219
"usock", NULL);
223220
if ((strlen(session) + 1) > sizeof(mca_oob_usock_component.address.sun_path)-1) {
224221
opal_output(0, "SESSION DIR TOO LONG");

0 commit comments

Comments
 (0)