Skip to content

Commit b12174d

Browse files
committed
Report error position.
1 parent 3708601 commit b12174d

File tree

2 files changed

+20
-12
lines changed

2 files changed

+20
-12
lines changed

rabit/src/allreduce_base.cc

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -217,7 +217,7 @@ void AllreduceBase::SetParam(const char *name, const char *val) {
217217
rabit_enable_tcp_no_delay = true;
218218
} else {
219219
rabit_enable_tcp_no_delay = false;
220-
}
220+
}
221221
}
222222
}
223223
/*!

rabit/src/allreduce_base.h

Lines changed: 19 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -109,9 +109,11 @@ class AllreduceBase : public IEngine {
109109
if (world_size == 1 || world_size == -1) {
110110
return;
111111
}
112-
utils::Assert(TryAllgatherRing(sendrecvbuf_, total_size, slice_begin,
113-
slice_end, size_prev_slice) == kSuccess,
114-
"AllgatherRing failed");
112+
auto ret = TryAllgatherRing(sendrecvbuf_, total_size, slice_begin,
113+
slice_end, size_prev_slice);
114+
if (ret != kSuccess) {
115+
utils::Error("AllgatherRing failed: %d\n", ret.line);
116+
}
115117
}
116118
/*!
117119
* \brief perform in-place allreduce, on sendrecvbuf
@@ -135,9 +137,10 @@ class AllreduceBase : public IEngine {
135137
const char *_caller = _CALLER) override {
136138
if (prepare_fun != nullptr) prepare_fun(prepare_arg);
137139
if (world_size == 1 || world_size == -1) return;
138-
utils::Assert(TryAllreduce(sendrecvbuf_, type_nbytes, count, reducer) ==
139-
kSuccess,
140-
"Allreduce failed");
140+
auto ret = TryAllreduce(sendrecvbuf_, type_nbytes, count, reducer);
141+
if (ret != kSuccess) {
142+
utils::Error("Allreduce failed: %d\n", ret.line);
143+
}
141144
}
142145
/*!
143146
* \brief broadcast data from root to all nodes
@@ -152,8 +155,10 @@ class AllreduceBase : public IEngine {
152155
const char *_file = _FILE, const int _line = _LINE,
153156
const char *_caller = _CALLER) override {
154157
if (world_size == 1 || world_size == -1) return;
155-
utils::Assert(TryBroadcast(sendrecvbuf_, total_size, root) == kSuccess,
156-
"Broadcast failed");
158+
auto ret = TryBroadcast(sendrecvbuf_, total_size, root);
159+
if (ret != kSuccess) {
160+
utils::Error("Broadcast failed: %d\n", ret.line);
161+
}
157162
}
158163
/*!
159164
* \brief load latest check point
@@ -272,9 +277,11 @@ class AllreduceBase : public IEngine {
272277
struct ReturnType {
273278
/*! \brief internal return type */
274279
ReturnTypeEnum value;
280+
int32_t line { -1 };
275281
// constructor
276-
ReturnType() = default;
277-
ReturnType(ReturnTypeEnum value) : value(value) {} // NOLINT(*)
282+
explicit ReturnType(int l = __builtin_LINE()) : line{l} {}
283+
ReturnType(ReturnTypeEnum value, int32_t l = __builtin_LINE()) : value(value), line{l} {} // NOLINT(*)
284+
278285
inline bool operator==(const ReturnTypeEnum &v) const {
279286
return value == v;
280287
}
@@ -518,7 +525,8 @@ class AllreduceBase : public IEngine {
518525
* \param err the error type
519526
*/
520527
inline ReturnType ReportError(LinkRecord *link, ReturnType err) {
521-
err_link = link; return err;
528+
err_link = link;
529+
return err;
522530
}
523531
//---- data structure related to model ----
524532
// call sequence counter, records how many calls we made so far

0 commit comments

Comments
 (0)