Skip to content

Commit 332d633

Browse files
committed
stacktrace: Add flexibility in stacktrace output
- New MCA option: opal_stacktrace_output - Specifies where the stack trace output stream goes. - Accepts: none, stdout, stderr, file[:filename] - Default filename 'stacktrace' - Filename will be `stacktrace.PID`, or if VPID is available, then the filename will be `stacktrace.VPID.PID` - Update util/stacktrace to allow for different output avenues including files. Previously this was hardcoded to 'stderr'. - Since opal_backtrace_print needs to be signal safe, passing it a FILE object that actually represents a file stream is difficult. This is because we cannot open the file in the signal handler using `fopen` (not safe), but have to use `open` (safe). Additionally, we cannot use `fdopen` to convert the `int fd` to a `FILE *fh` since it is also not signal safe. - I did not want to break the backtrace.h API so I introduced a new rule (documented in `backtrace.c`) that if the `FILE *file` argument is `NULL` then look for the `opal_stacktrace_output_fileno` variable to tell you which file descriptor to use for output. Signed-off-by: Joshua Hursey <[email protected]>
1 parent 0b115a7 commit 332d633

File tree

7 files changed

+194
-11
lines changed

7 files changed

+194
-11
lines changed

opal/mca/backtrace/backtrace.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1414
* reserved.
15+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -26,6 +27,7 @@
2627

2728
#include "opal/mca/mca.h"
2829
#include "opal/mca/base/base.h"
30+
#include "opal/util/stacktrace.h"
2931

3032
BEGIN_C_DECLS
3133

@@ -39,6 +41,8 @@ BEGIN_C_DECLS
3941
/*
4042
* Print back trace to FILE file with a prefix for each line.
4143
* First strip lines are not printed.
44+
* If 'file' is NULL then the component should try to use the file descriptor
45+
* saved in opal_stacktrace_output_fileno
4246
*
4347
* \note some attempts made to be signal safe.
4448
*/

opal/mca/backtrace/execinfo/backtrace_execinfo.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
* Copyright (c) 2004-2006 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1314
* $COPYRIGHT$
1415
*
1516
* Additional copyrights may follow
@@ -34,12 +35,16 @@
3435
int
3536
opal_backtrace_print(FILE *file, char *prefix, int strip)
3637
{
37-
int i, fd, len;
38+
int i, len;
3839
int trace_size;
3940
void * trace[32];
4041
char buf[6];
42+
int fd = opal_stacktrace_output_fileno;
43+
44+
if( NULL != file ) {
45+
fd = fileno(file);
46+
}
4147

42-
fd = fileno (file);
4348
if (-1 == fd) {
4449
return OPAL_ERR_BAD_PARAM;
4550
}

opal/mca/backtrace/printstack/backtrace_printstack.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
* Copyright (c) 2004-2006 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
13+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1314
* $COPYRIGHT$
1415
*
1516
* Additional copyrights may follow
@@ -27,7 +28,13 @@
2728
int
2829
opal_backtrace_print(FILE *file, char *prefix, int strip)
2930
{
30-
printstack(fileno(file));
31+
int fd = opal_stacktrace_output_fileno;
32+
33+
if( NULL != file ) {
34+
fd = fileno(file);
35+
}
36+
37+
printstack(fd);
3138

3239
return OPAL_SUCCESS;
3340
}

opal/runtime/opal_params.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* and Technology (RIST). All rights reserved.
2222
* Copyright (c) 2015 Mellanox Technologies, Inc.
2323
* All rights reserved.
24+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
2425
* $COPYRIGHT$
2526
*
2627
* Additional copyrights may follow
@@ -48,6 +49,7 @@
4849
#include "opal/util/timings.h"
4950

5051
char *opal_signal_string = NULL;
52+
char *opal_stacktrace_output_filename = NULL;
5153
char *opal_net_private_ipv4 = NULL;
5254
char *opal_set_max_sys_limits = NULL;
5355

@@ -76,6 +78,7 @@ static bool opal_register_done = false;
7678
int opal_register_params(void)
7779
{
7880
int ret;
81+
char *string = NULL;
7982

8083
if (opal_register_done) {
8184
return OPAL_SUCCESS;
@@ -87,7 +90,6 @@ int opal_register_params(void)
8790
* This string is going to be used in opal/util/stacktrace.c
8891
*/
8992
{
90-
char *string = NULL;
9193
int j;
9294
int signals[] = {
9395
#ifdef SIGABRT
@@ -127,6 +129,28 @@ int opal_register_params(void)
127129
}
128130
}
129131

132+
/*
133+
* Where should the stack trace output be directed
134+
* This string is going to be used in opal/util/stacktrace.c
135+
*/
136+
string = strdup("stderr");
137+
opal_stacktrace_output_filename = string;
138+
ret = mca_base_var_register ("opal", "opal", NULL, "stacktrace_output",
139+
"Specifies where the stack trace output stream goes. "
140+
"Accepts one of the following: none (disabled), stderr (default), stdout, file[:filename]. "
141+
"If 'filename' is not specified, a default filename of 'stacktrace' is used. "
142+
"The 'filename' is appended with either '.PID' or '.RANK.PID', if RANK is available. "
143+
"The 'filename' can be an absolute path or a relative path to the current working directory.",
144+
MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
145+
OPAL_INFO_LVL_3,
146+
MCA_BASE_VAR_SCOPE_LOCAL,
147+
&opal_stacktrace_output_filename);
148+
free (string);
149+
if (0 > ret) {
150+
return ret;
151+
}
152+
153+
130154
#if defined(HAVE_SCHED_YIELD)
131155
opal_progress_yield_when_idle = false;
132156
ret = mca_base_var_register ("opal", "opal", "progress", "yield_when_idle",

opal/runtime/opal_params.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
1919
* Copyright (c) 2015 Mellanox Technologies, Inc.
2020
* All rights reserved.
21+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
2122
* $COPYRIGHT$
2223
*
2324
* Additional copyrights may follow
@@ -29,6 +30,7 @@
2930
#define OPAL_PARAMS_H
3031

3132
extern char *opal_signal_string;
33+
extern char *opal_stacktrace_output_filename;
3234
extern char *opal_net_private_ipv4;
3335
extern char *opal_set_max_sys_limits;
3436

opal/util/stacktrace.c

Lines changed: 141 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,15 @@
2525
#ifdef HAVE_UNISTD_H
2626
#include <unistd.h>
2727
#endif
28+
#ifdef HAVE_SYS_TYPES_H
29+
#include <sys/types.h>
30+
#endif
31+
#ifdef HAVE_SYS_STAT_H
32+
#include <sys/stat.h>
33+
#endif
34+
#ifdef HAVE_SYS_FCNTL_H
35+
#include <fcntl.h>
36+
#endif
2837

2938
#include <string.h>
3039
#include <signal.h>
@@ -35,6 +44,7 @@
3544
#include "opal/util/output.h"
3645
#include "opal/util/show_help.h"
3746
#include "opal/util/argv.h"
47+
#include "opal/util/proc.h"
3848
#include "opal/runtime/opal_params.h"
3949

4050
#ifndef _NSIG
@@ -43,9 +53,35 @@
4353

4454
#define HOSTFORMAT "[%s:%05d] "
4555

56+
int opal_stacktrace_output_fileno = -1;
57+
static char *opal_stacktrace_output_filename_base = NULL;
58+
static size_t opal_stacktrace_output_filename_max_len = 0;
4659
static char stacktrace_hostname[OPAL_MAXHOSTNAMELEN];
4760
static char *unable_to_print_msg = "Unable to print stack trace!\n";
4861

62+
/*
63+
* Set the stacktrace filename:
64+
* stacktrace.PID
65+
* -or, if VPID is available-
66+
* stacktrace.VPID.PID
67+
*/
68+
static void set_stacktrace_filename(void) {
69+
opal_proc_t *my_proc = opal_proc_local_get();
70+
71+
if( NULL == my_proc ) {
72+
snprintf(opal_stacktrace_output_filename, opal_stacktrace_output_filename_max_len,
73+
"%s.%lu",
74+
opal_stacktrace_output_filename_base, (unsigned long)getpid());
75+
}
76+
else {
77+
snprintf(opal_stacktrace_output_filename, opal_stacktrace_output_filename_max_len,
78+
"%s.%lu.%lu",
79+
opal_stacktrace_output_filename_base, (unsigned long)my_proc->proc_name.vpid, (unsigned long)getpid());
80+
}
81+
82+
return;
83+
}
84+
4985
/**
5086
* This function is being called as a signal-handler in response
5187
* to a user-specified signal (e.g. SIGFPE or SIGSEGV).
@@ -69,12 +105,37 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
69105
int ret;
70106
char *si_code_str = "";
71107

108+
/* Do not print the stack trace */
109+
if( 0 > opal_stacktrace_output_fileno && 0 == opal_stacktrace_output_filename_max_len ) {
110+
/* Raise the signal again, so we don't accidentally mask critical signals.
111+
* For critical signals, it is preferred that we call 'raise' instead of
112+
* 'exit' or 'abort' so that the return status is set properly for this
113+
* process.
114+
*/
115+
signal(signo, SIG_DFL);
116+
raise(signo);
117+
118+
return;
119+
}
120+
121+
/* Update the file name with the RANK, if available */
122+
if( 0 < opal_stacktrace_output_filename_max_len ) {
123+
set_stacktrace_filename();
124+
opal_stacktrace_output_fileno = open(opal_stacktrace_output_filename,
125+
O_CREAT|O_WRONLY|O_TRUNC, S_IRUSR|S_IWUSR);
126+
if( 0 > opal_stacktrace_output_fileno ) {
127+
opal_output(0, "Error: Failed to open the stacktrace output file. Default: stderr\n\tFilename: %s\n\tErrno: %s",
128+
opal_stacktrace_output_filename, strerror(errno));
129+
opal_stacktrace_output_fileno = fileno(stderr);
130+
}
131+
}
132+
72133
/* write out the footer information */
73134
memset (print_buffer, 0, sizeof (print_buffer));
74135
ret = snprintf(print_buffer, sizeof(print_buffer),
75136
HOSTFORMAT "*** Process received signal ***\n",
76137
stacktrace_hostname, getpid());
77-
write(fileno(stderr), print_buffer, ret);
138+
write(opal_stacktrace_output_fileno, print_buffer, ret);
78139

79140

80141
memset (print_buffer, 0, sizeof (print_buffer));
@@ -324,14 +385,14 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
324385
}
325386

326387
/* write out the signal information generated above */
327-
write(fileno(stderr), print_buffer, sizeof(print_buffer)-size);
388+
write(opal_stacktrace_output_fileno, print_buffer, sizeof(print_buffer)-size);
328389

329390
/* print out the stack trace */
330391
snprintf(print_buffer, sizeof(print_buffer), HOSTFORMAT,
331392
stacktrace_hostname, getpid());
332-
ret = opal_backtrace_print(stderr, print_buffer, 2);
393+
ret = opal_backtrace_print(NULL, print_buffer, 2);
333394
if (OPAL_SUCCESS != ret) {
334-
write(fileno(stderr), unable_to_print_msg, strlen(unable_to_print_msg));
395+
write(opal_stacktrace_output_fileno, unable_to_print_msg, strlen(unable_to_print_msg));
335396
}
336397

337398
/* write out the footer information */
@@ -340,9 +401,15 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
340401
HOSTFORMAT "*** End of error message ***\n",
341402
stacktrace_hostname, getpid());
342403
if (ret > 0) {
343-
write(fileno(stderr), print_buffer, ret);
404+
write(opal_stacktrace_output_fileno, print_buffer, ret);
344405
} else {
345-
write(fileno(stderr), unable_to_print_msg, strlen(unable_to_print_msg));
406+
write(opal_stacktrace_output_fileno, unable_to_print_msg, strlen(unable_to_print_msg));
407+
}
408+
409+
if( fileno(stdout) != opal_stacktrace_output_fileno &&
410+
fileno(stderr) != opal_stacktrace_output_fileno ) {
411+
close(opal_stacktrace_output_fileno);
412+
opal_stacktrace_output_fileno = -1;
346413
}
347414

348415
/* Raise the signal again, so we don't accidentally mask critical signals.
@@ -372,7 +439,30 @@ void opal_stackframe_output(int stream)
372439
opal_output(stream, "%s", traces[i]);
373440
}
374441
} else {
375-
opal_backtrace_print(stderr, NULL, 2);
442+
/* Do not print the stack trace */
443+
if( 0 > opal_stacktrace_output_fileno && 0 == opal_stacktrace_output_filename_max_len ) {
444+
return;
445+
}
446+
447+
/* Update the file name with the RANK, if available */
448+
if( 0 < opal_stacktrace_output_filename_max_len ) {
449+
set_stacktrace_filename();
450+
opal_stacktrace_output_fileno = open(opal_stacktrace_output_filename,
451+
O_CREAT|O_WRONLY|O_TRUNC, S_IRUSR|S_IWUSR);
452+
if( 0 > opal_stacktrace_output_fileno ) {
453+
opal_output(0, "Error: Failed to open the stacktrace output file. Default: stderr\n\tFilename: %s\n\tErrno: %s",
454+
opal_stacktrace_output_filename, strerror(errno));
455+
opal_stacktrace_output_fileno = fileno(stderr);
456+
}
457+
}
458+
459+
opal_backtrace_print(NULL, NULL, 2);
460+
461+
if( fileno(stdout) != opal_stacktrace_output_fileno &&
462+
fileno(stderr) != opal_stacktrace_output_fileno ) {
463+
close(opal_stacktrace_output_fileno);
464+
opal_stacktrace_output_fileno = -1;
465+
}
376466
}
377467
}
378468

@@ -443,6 +533,50 @@ int opal_util_register_stackhandlers (void)
443533
}
444534
}
445535

536+
/* Setup the output stream to use */
537+
if( NULL == opal_stacktrace_output_filename ||
538+
0 == strcasecmp(opal_stacktrace_output_filename, "none") ) {
539+
opal_stacktrace_output_fileno = -1;
540+
}
541+
else if( 0 == strcasecmp(opal_stacktrace_output_filename, "stdout") ) {
542+
opal_stacktrace_output_fileno = fileno(stdout);
543+
}
544+
else if( 0 == strcasecmp(opal_stacktrace_output_filename, "stderr") ) {
545+
opal_stacktrace_output_fileno = fileno(stderr); /* "stderr" selects the stderr descriptor, not stdout */
546+
}
547+
else if( 0 == strcasecmp(opal_stacktrace_output_filename, "file" ) ||
548+
0 == strcasecmp(opal_stacktrace_output_filename, "file:") ) {
549+
opal_stacktrace_output_filename_base = strdup("stacktrace");
550+
551+
free(opal_stacktrace_output_filename);
552+
// Magic number: 8 = space for .PID and .RANK (allow 7 digits each)
553+
opal_stacktrace_output_filename_max_len = strlen("stacktrace") + 8 + 8;
554+
opal_stacktrace_output_filename = (char*)malloc(sizeof(char) * opal_stacktrace_output_filename_max_len);
555+
set_stacktrace_filename();
556+
opal_stacktrace_output_fileno = -1;
557+
}
558+
else if( 0 == strncasecmp(opal_stacktrace_output_filename, "file:", 5) ) {
559+
char *filename_cpy = NULL;
560+
next = strchr(opal_stacktrace_output_filename, ':');
561+
next++; // move past the ':' to the filename specified
562+
563+
opal_stacktrace_output_filename_base = strdup(next);
564+
565+
free(opal_stacktrace_output_filename);
566+
// Magic number: 8 = space for .PID and .RANK (allow 7 digits each)
567+
opal_stacktrace_output_filename_max_len = strlen(opal_stacktrace_output_filename_base) + 8 + 8;
568+
opal_stacktrace_output_filename = (char*)malloc(sizeof(char) * opal_stacktrace_output_filename_max_len);
569+
set_stacktrace_filename();
570+
opal_stacktrace_output_fileno = -1;
571+
572+
free(filename_cpy);
573+
}
574+
else {
575+
opal_stacktrace_output_fileno = fileno(stderr);
576+
}
577+
578+
579+
/* Setup the signals to catch */
446580
memset(&act, 0, sizeof(act));
447581
act.sa_sigaction = show_stackframe;
448582
act.sa_flags = SA_SIGINFO;

opal/util/stacktrace.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
* Copyright (c) 2004-2005 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2008 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1314
* $COPYRIGHT$
1415
*
1516
* Additional copyrights may follow
@@ -24,6 +25,12 @@
2425

2526
#include "opal_config.h"
2627

28+
/*
29+
* File descriptor to be used by the backtrace framework if opal_backtrace_print
30+
* is passed NULL for its FILE file pointer.
31+
*/
32+
extern int opal_stacktrace_output_fileno;
33+
2734
/**
2835
* Output the current stack trace (not including the call to this
2936
* function) to the stream indicated.

0 commit comments

Comments
 (0)