Skip to content

Commit 938e12c

Browse files
authored
Merge pull request #2856 from jjhursey/topic/ibm/v2.x/stacktrace-improv
v2.x: opal/stacktrace improvements
2 parents 18a0219 + eab30b0 commit 938e12c

File tree

7 files changed

+203
-11
lines changed

7 files changed

+203
-11
lines changed

opal/mca/backtrace/backtrace.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2015 Los Alamos National Security, LLC. All rights
1414
* reserved.
15+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1516
* $COPYRIGHT$
1617
*
1718
* Additional copyrights may follow
@@ -26,6 +27,7 @@
2627

2728
#include "opal/mca/mca.h"
2829
#include "opal/mca/base/base.h"
30+
#include "opal/util/stacktrace.h"
2931

3032
BEGIN_C_DECLS
3133

@@ -39,6 +41,8 @@ BEGIN_C_DECLS
3941
/*
4042
* Print back trace to FILE file with a prefix for each line.
4143
* The first 'strip' lines of the trace are not printed.
44+
* If 'file' is NULL then the component should try to use the file descriptor
45+
* saved in opal_stacktrace_output_fileno
4246
*
4347
* \note some attempts made to be signal safe.
4448
*/

opal/mca/backtrace/execinfo/backtrace_execinfo.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
* Copyright (c) 2004-2006 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2011 Cisco Systems, Inc. All rights reserved.
13+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1314
* $COPYRIGHT$
1415
*
1516
* Additional copyrights may follow
@@ -34,12 +35,16 @@
3435
int
3536
opal_backtrace_print(FILE *file, char *prefix, int strip)
3637
{
37-
int i, fd, len;
38+
int i, len;
3839
int trace_size;
3940
void * trace[32];
4041
char buf[6];
42+
int fd = opal_stacktrace_output_fileno;
43+
44+
if( NULL != file ) {
45+
fd = fileno(file);
46+
}
4147

42-
fd = fileno (file);
4348
if (-1 == fd) {
4449
return OPAL_ERR_BAD_PARAM;
4550
}

opal/mca/backtrace/printstack/backtrace_printstack.c

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
* Copyright (c) 2004-2006 The Regents of the University of California.
1111
* All rights reserved.
1212
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
13+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1314
* $COPYRIGHT$
1415
*
1516
* Additional copyrights may follow
@@ -27,7 +28,13 @@
2728
int
2829
opal_backtrace_print(FILE *file, char *prefix, int strip)
2930
{
30-
printstack(fileno(file));
31+
int fd = opal_stacktrace_output_fileno;
32+
33+
if( NULL != file ) {
34+
fd = fileno(file);
35+
}
36+
37+
printstack(fd);
3138

3239
return OPAL_SUCCESS;
3340
}

opal/runtime/opal_params.c

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* and Technology (RIST). All rights reserved.
2222
* Copyright (c) 2015 Mellanox Technologies, Inc.
2323
* All rights reserved.
24+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
2425
* $COPYRIGHT$
2526
*
2627
* Additional copyrights may follow
@@ -48,6 +49,7 @@
4849
#include "opal/util/bit_ops.h"
4950

5051
char *opal_signal_string = NULL;
52+
char *opal_stacktrace_output_filename = NULL;
5153
char *opal_net_private_ipv4 = NULL;
5254
char *opal_set_max_sys_limits = NULL;
5355

@@ -74,6 +76,7 @@ static bool opal_register_done = false;
7476
int opal_register_params(void)
7577
{
7678
int ret;
79+
char *string = NULL;
7780

7881
if (opal_register_done) {
7982
return OPAL_SUCCESS;
@@ -85,7 +88,6 @@ int opal_register_params(void)
8588
* This string is going to be used in opal/util/stacktrace.c
8689
*/
8790
{
88-
char *string = NULL;
8991
int j;
9092
int signals[] = {
9193
#ifdef SIGABRT
@@ -125,6 +127,28 @@ int opal_register_params(void)
125127
}
126128
}
127129

130+
/*
131+
* Where should the stack trace output be directed
132+
* This string is going to be used in opal/util/stacktrace.c
133+
*/
134+
string = strdup("stderr");
135+
opal_stacktrace_output_filename = string;
136+
ret = mca_base_var_register ("opal", "opal", NULL, "stacktrace_output",
137+
"Specifies where the stack trace output stream goes. "
138+
"Accepts one of the following: none (disabled), stderr (default), stdout, file[:filename]. "
139+
"If 'filename' is not specified, a default filename of 'stacktrace' is used. "
140+
"The 'filename' is appended with either '.PID' or '.RANK.PID', if RANK is available. "
141+
"The 'filename' can be an absolute path or a relative path to the current working directory.",
142+
MCA_BASE_VAR_TYPE_STRING, NULL, 0, MCA_BASE_VAR_FLAG_SETTABLE,
143+
OPAL_INFO_LVL_3,
144+
MCA_BASE_VAR_SCOPE_LOCAL,
145+
&opal_stacktrace_output_filename);
146+
free (string);
147+
if (0 > ret) {
148+
return ret;
149+
}
150+
151+
128152
#if defined(HAVE_SCHED_YIELD)
129153
opal_progress_yield_when_idle = false;
130154
ret = mca_base_var_register ("opal", "opal", "progress", "yield_when_idle",

opal/runtime/opal_params.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
* Copyright (c) 2014 Hochschule Esslingen. All rights reserved.
1919
* Copyright (c) 2015 Mellanox Technologies, Inc.
2020
* All rights reserved.
21+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
2122
* $COPYRIGHT$
2223
*
2324
* Additional copyrights may follow
@@ -29,6 +30,7 @@
2930
#define OPAL_PARAMS_H
3031

3132
extern char *opal_signal_string;
33+
extern char *opal_stacktrace_output_filename;
3234
extern char *opal_net_private_ipv4;
3335
extern char *opal_set_max_sys_limits;
3436

opal/util/stacktrace.c

Lines changed: 150 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006 Sun Microsystems, Inc. All rights reserved.
1313
* Copyright (c) 2008-2009 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2017 IBM Corporation. All rights reserved.
1415
* $COPYRIGHT$
1516
*
1617
* Additional copyrights may follow
@@ -24,6 +25,15 @@
2425
#ifdef HAVE_UNISTD_H
2526
#include <unistd.h>
2627
#endif
28+
#ifdef HAVE_SYS_TYPES_H
29+
#include <sys/types.h>
30+
#endif
31+
#ifdef HAVE_SYS_STAT_H
32+
#include <sys/stat.h>
33+
#endif
34+
#ifdef HAVE_SYS_FCNTL_H
35+
#include <fcntl.h>
36+
#endif
2737

2838
#include <string.h>
2939
#include <signal.h>
@@ -34,6 +44,7 @@
3444
#include "opal/util/output.h"
3545
#include "opal/util/show_help.h"
3646
#include "opal/util/argv.h"
47+
#include "opal/util/proc.h"
3748
#include "opal/runtime/opal_params.h"
3849

3950
#ifndef _NSIG
@@ -42,9 +53,35 @@
4253

4354
#define HOSTFORMAT "[%s:%05d] "
4455

56+
int opal_stacktrace_output_fileno = -1;
57+
static char *opal_stacktrace_output_filename_base = NULL;
58+
static size_t opal_stacktrace_output_filename_max_len = 0;
4559
static char stacktrace_hostname[OPAL_MAXHOSTNAMELEN];
4660
static char *unable_to_print_msg = "Unable to print stack trace!\n";
4761

62+
/*
63+
* Set the stacktrace filename:
64+
* stacktrace.PID
65+
* -or, if VPID is available-
66+
* stacktrace.VPID.PID
67+
*/
68+
static void set_stacktrace_filename(void) {
69+
opal_proc_t *my_proc = opal_proc_local_get();
70+
71+
if( NULL == my_proc ) {
72+
snprintf(opal_stacktrace_output_filename, opal_stacktrace_output_filename_max_len,
73+
"%s.%lu",
74+
opal_stacktrace_output_filename_base, (unsigned long)getpid());
75+
}
76+
else {
77+
snprintf(opal_stacktrace_output_filename, opal_stacktrace_output_filename_max_len,
78+
"%s.%lu.%lu",
79+
opal_stacktrace_output_filename_base, (unsigned long)my_proc->proc_name.vpid, (unsigned long)getpid());
80+
}
81+
82+
return;
83+
}
84+
4885
/**
4986
* This function is being called as a signal-handler in response
5087
* to a user-specified signal (e.g. SIGFPE or SIGSEGV).
@@ -68,12 +105,37 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
68105
int ret;
69106
char *si_code_str = "";
70107

108+
/* Do not print the stack trace */
109+
if( 0 > opal_stacktrace_output_fileno && 0 == opal_stacktrace_output_filename_max_len ) {
110+
/* Raise the signal again, so we don't accidentally mask critical signals.
111+
* For critical signals, it is preferred that we call 'raise' instead of
112+
* 'exit' or 'abort' so that the return status is set properly for this
113+
* process.
114+
*/
115+
signal(signo, SIG_DFL);
116+
raise(signo);
117+
118+
return;
119+
}
120+
121+
/* Update the file name with the RANK, if available */
122+
if( 0 < opal_stacktrace_output_filename_max_len ) {
123+
set_stacktrace_filename();
124+
opal_stacktrace_output_fileno = open(opal_stacktrace_output_filename,
125+
O_CREAT|O_WRONLY|O_TRUNC, S_IRUSR|S_IWUSR);
126+
if( 0 > opal_stacktrace_output_fileno ) {
127+
opal_output(0, "Error: Failed to open the stacktrace output file. Default: stderr\n\tFilename: %s\n\tErrno: %s",
128+
opal_stacktrace_output_filename, strerror(errno));
129+
opal_stacktrace_output_fileno = fileno(stderr);
130+
}
131+
}
132+
71133
/* write out the header information */
72134
memset (print_buffer, 0, sizeof (print_buffer));
73135
ret = snprintf(print_buffer, sizeof(print_buffer),
74136
HOSTFORMAT "*** Process received signal ***\n",
75137
stacktrace_hostname, getpid());
76-
write(fileno(stderr), print_buffer, ret);
138+
write(opal_stacktrace_output_fileno, print_buffer, ret);
77139

78140

79141
memset (print_buffer, 0, sizeof (print_buffer));
@@ -323,14 +385,14 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
323385
}
324386

325387
/* write out the signal information generated above */
326-
write(fileno(stderr), print_buffer, sizeof(print_buffer)-size);
388+
write(opal_stacktrace_output_fileno, print_buffer, sizeof(print_buffer)-size);
327389

328390
/* print out the stack trace */
329391
snprintf(print_buffer, sizeof(print_buffer), HOSTFORMAT,
330392
stacktrace_hostname, getpid());
331-
ret = opal_backtrace_print(stderr, print_buffer, 2);
393+
ret = opal_backtrace_print(NULL, print_buffer, 2);
332394
if (OPAL_SUCCESS != ret) {
333-
write(fileno(stderr), unable_to_print_msg, strlen(unable_to_print_msg));
395+
write(opal_stacktrace_output_fileno, unable_to_print_msg, strlen(unable_to_print_msg));
334396
}
335397

336398
/* write out the footer information */
@@ -339,10 +401,24 @@ static void show_stackframe (int signo, siginfo_t * info, void * p)
339401
HOSTFORMAT "*** End of error message ***\n",
340402
stacktrace_hostname, getpid());
341403
if (ret > 0) {
342-
write(fileno(stderr), print_buffer, ret);
404+
write(opal_stacktrace_output_fileno, print_buffer, ret);
343405
} else {
344-
write(fileno(stderr), unable_to_print_msg, strlen(unable_to_print_msg));
406+
write(opal_stacktrace_output_fileno, unable_to_print_msg, strlen(unable_to_print_msg));
407+
}
408+
409+
if( fileno(stdout) != opal_stacktrace_output_fileno &&
410+
fileno(stderr) != opal_stacktrace_output_fileno ) {
411+
close(opal_stacktrace_output_fileno);
412+
opal_stacktrace_output_fileno = -1;
345413
}
414+
415+
/* Raise the signal again, so we don't accidentally mask critical signals.
416+
* For critical signals, it is preferred that we call 'raise' instead of
417+
* 'exit' or 'abort' so that the return status is set properly for this
418+
* process.
419+
*/
420+
signal(signo, SIG_DFL);
421+
raise(signo);
346422
}
347423

348424
#endif /* OPAL_WANT_PRETTY_PRINT_STACKTRACE */
@@ -364,7 +440,30 @@ void opal_stackframe_output(int stream)
364440
opal_output(stream, "%s", traces[i]);
365441
}
366442
} else {
367-
opal_backtrace_print(stderr, NULL, 2);
443+
/* Do not print the stack trace */
444+
if( 0 > opal_stacktrace_output_fileno && 0 == opal_stacktrace_output_filename_max_len ) {
445+
return;
446+
}
447+
448+
/* Update the file name with the RANK, if available */
449+
if( 0 < opal_stacktrace_output_filename_max_len ) {
450+
set_stacktrace_filename();
451+
opal_stacktrace_output_fileno = open(opal_stacktrace_output_filename,
452+
O_CREAT|O_WRONLY|O_TRUNC, S_IRUSR|S_IWUSR);
453+
if( 0 > opal_stacktrace_output_fileno ) {
454+
opal_output(0, "Error: Failed to open the stacktrace output file. Default: stderr\n\tFilename: %s\n\tErrno: %s",
455+
opal_stacktrace_output_filename, strerror(errno));
456+
opal_stacktrace_output_fileno = fileno(stderr);
457+
}
458+
}
459+
460+
opal_backtrace_print(NULL, NULL, 2);
461+
462+
if( fileno(stdout) != opal_stacktrace_output_fileno &&
463+
fileno(stderr) != opal_stacktrace_output_fileno ) {
464+
close(opal_stacktrace_output_fileno);
465+
opal_stacktrace_output_fileno = -1;
466+
}
368467
}
369468
}
370469

@@ -435,6 +534,50 @@ int opal_util_register_stackhandlers (void)
435534
}
436535
}
437536

537+
/* Setup the output stream to use */
538+
if( NULL == opal_stacktrace_output_filename ||
539+
0 == strcasecmp(opal_stacktrace_output_filename, "none") ) {
540+
opal_stacktrace_output_fileno = -1;
541+
}
542+
else if( 0 == strcasecmp(opal_stacktrace_output_filename, "stdout") ) {
543+
opal_stacktrace_output_fileno = fileno(stdout);
544+
}
545+
else if( 0 == strcasecmp(opal_stacktrace_output_filename, "stderr") ) {
546+
opal_stacktrace_output_fileno = fileno(stdout);
547+
}
548+
else if( 0 == strcasecmp(opal_stacktrace_output_filename, "file" ) ||
549+
0 == strcasecmp(opal_stacktrace_output_filename, "file:") ) {
550+
opal_stacktrace_output_filename_base = strdup("stacktrace");
551+
552+
free(opal_stacktrace_output_filename);
553+
// Magic number: 8 = space for .PID and .RANK (allow 7 digits each)
554+
opal_stacktrace_output_filename_max_len = strlen("stacktrace") + 8 + 8;
555+
opal_stacktrace_output_filename = (char*)malloc(sizeof(char) * opal_stacktrace_output_filename_max_len);
556+
set_stacktrace_filename();
557+
opal_stacktrace_output_fileno = -1;
558+
}
559+
else if( 0 == strncasecmp(opal_stacktrace_output_filename, "file:", 5) ) {
560+
char *filename_cpy = NULL;
561+
next = strchr(opal_stacktrace_output_filename, ':');
562+
next++; // move past the ':' to the filename specified
563+
564+
opal_stacktrace_output_filename_base = strdup(next);
565+
566+
free(opal_stacktrace_output_filename);
567+
// Magic number: 8 = space for .PID and .RANK (allow 7 digits each)
568+
opal_stacktrace_output_filename_max_len = strlen(opal_stacktrace_output_filename_base) + 8 + 8;
569+
opal_stacktrace_output_filename = (char*)malloc(sizeof(char) * opal_stacktrace_output_filename_max_len);
570+
set_stacktrace_filename();
571+
opal_stacktrace_output_fileno = -1;
572+
573+
free(filename_cpy);
574+
}
575+
else {
576+
opal_stacktrace_output_fileno = fileno(stderr);
577+
}
578+
579+
580+
/* Setup the signals to catch */
438581
memset(&act, 0, sizeof(act));
439582
act.sa_sigaction = show_stackframe;
440583
act.sa_flags = SA_SIGINFO;

0 commit comments

Comments
 (0)