Skip to content

Commit 4b5d38f

Browse files
committed
GPU initialization.
1 parent 68b838d commit 4b5d38f

File tree

4 files changed

+55
-47
lines changed

4 files changed

+55
-47
lines changed

src/data/sparse_page_source.h

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
77

88
#include <algorithm> // for min
9-
#include <future>
9+
#include <future> // async
1010
#include <map>
1111
#include <memory>
1212
#include <string>
@@ -18,6 +18,7 @@
1818
#include "../common/io.h" // for PrivateMmapStream, PadPageForMMAP
1919
#include "../common/timer.h"
2020
#include "adapter.h"
21+
#include "dmlc/common.h" // OMPException
2122
#include "proxy_dmatrix.h"
2223
#include "sparse_page_writer.h"
2324
#include "xgboost/base.h"
@@ -102,6 +103,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
102103
// A ring storing futures to data. Since the DMatrix iterator is forward only, so we
103104
// can pre-fetch data in a ring.
104105
std::unique_ptr<Ring> ring_{new Ring};
106+
dmlc::OMPException exec_;
105107

106108
bool ReadCache() {
107109
CHECK(!at_end_);
@@ -119,35 +121,41 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
119121
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
120122
std::size_t fetch_it = count_;
121123

124+
exec_.Rethrow();
125+
122126
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
123127
fetch_it %= n_batches_; // ring
124128
if (ring_->at(fetch_it).valid()) {
125129
continue;
126130
}
127-
auto const *self = this; // make sure it's const
131+
auto const* self = this; // make sure it's const
128132
CHECK_LT(fetch_it, cache_info_->offset.size());
129-
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
133+
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
130134
auto page = std::make_shared<S>();
131-
132-
common::Timer timer;
133-
timer.Start();
134-
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
135-
auto n = self->cache_info_->ShardName();
136-
137-
std::uint64_t offset = self->cache_info_->offset.at(fetch_it);
138-
std::uint64_t length = self->cache_info_->offset.at(fetch_it + 1) - offset;
139-
140-
auto fi = std::make_unique<common::PrivateMmapStream>(n, true, offset, length);
141-
CHECK(fmt->Read(page.get(), fi.get()));
142-
LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
135+
this->exec_.Run([&] {
136+
common::Timer timer;
137+
timer.Start();
138+
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
139+
auto n = self->cache_info_->ShardName();
140+
141+
std::uint64_t offset = self->cache_info_->offset.at(fetch_it);
142+
std::uint64_t length = self->cache_info_->offset.at(fetch_it + 1) - offset;
143+
144+
auto fi = std::make_unique<common::PrivateMmapStream>(n, true, offset, length);
145+
CHECK(fmt->Read(page.get(), fi.get()));
146+
LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
147+
});
143148
return page;
144149
});
145150
}
151+
146152
CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
147153
n_prefetch_batches)
148154
<< "Sparse DMatrix assumes forward iteration.";
149155
page_ = (*ring_)[count_].get();
150156
CHECK(!(*ring_)[count_].valid());
157+
exec_.Rethrow();
158+
151159
return true;
152160
}
153161

src/tree/gpu_hist/gradient_based_sampler.cu

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -154,20 +154,22 @@ GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<Gradient
154154
return {dmat->Info().num_row_, page, gpair};
155155
}
156156

157-
ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
158-
size_t n_rows, BatchParam batch_param)
159-
: batch_param_{std::move(batch_param)},
160-
page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
161-
n_rows)) {}
157+
ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
158+
: batch_param_{std::move(batch_param)} {}
162159

163160
GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
164161
common::Span<GradientPair> gpair,
165162
DMatrix* dmat) {
166163
if (!page_concatenated_) {
167164
// Concatenate all the external memory ELLPACK pages into a single in-memory page.
165+
page_.reset(nullptr);
168166
size_t offset = 0;
169167
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
170168
auto page = batch.Impl();
169+
if (!page_) {
170+
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
171+
page->row_stride, dmat->Info().num_row_);
172+
}
171173
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
172174
offset += num_elements;
173175
}
@@ -319,13 +321,12 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
319321
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
320322
}
321323

322-
GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
324+
GradientBasedSampler::GradientBasedSampler(Context const* ctx, bool is_external_memory,
323325
size_t n_rows, const BatchParam& batch_param,
324326
float subsample, int sampling_method) {
325327
monitor_.Init("gradient_based_sampler");
326328

327329
bool is_sampling = subsample < 1.0;
328-
bool is_external_memory = page->n_rows != n_rows;
329330

330331
if (is_sampling) {
331332
switch (sampling_method) {
@@ -338,17 +339,17 @@ GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl c
338339
break;
339340
case TrainParam::kGradientBased:
340341
if (is_external_memory) {
341-
strategy_.reset(
342-
new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
342+
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
343343
} else {
344344
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
345345
}
346346
break;
347-
default:LOG(FATAL) << "unknown sampling method";
347+
default:
348+
LOG(FATAL) << "unknown sampling method";
348349
}
349350
} else {
350351
if (is_external_memory) {
351-
strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
352+
strategy_.reset(new ExternalMemoryNoSampling(batch_param));
352353
} else {
353354
strategy_.reset(new NoSampling(batch_param));
354355
}

src/tree/gpu_hist/gradient_based_sampler.cuh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,13 @@ class NoSampling : public SamplingStrategy {
4343
/*! \brief No sampling in external memory mode. */
4444
class ExternalMemoryNoSampling : public SamplingStrategy {
4545
public:
46-
ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
47-
BatchParam batch_param);
46+
explicit ExternalMemoryNoSampling(BatchParam batch_param);
4847
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
4948
DMatrix* dmat) override;
5049

5150
private:
5251
BatchParam batch_param_;
53-
std::unique_ptr<EllpackPageImpl> page_;
52+
std::unique_ptr<EllpackPageImpl> page_{nullptr};
5453
bool page_concatenated_{false};
5554
};
5655

@@ -123,7 +122,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
123122
*/
124123
class GradientBasedSampler {
125124
public:
126-
GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
125+
GradientBasedSampler(Context const* ctx, bool is_external_memory, size_t n_rows,
127126
const BatchParam& batch_param, float subsample, int sampling_method);
128127

129128
/*! \brief Sample from a DMatrix based on the given gradient pairs. */

src/tree/updater_gpu_hist.cu

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -205,32 +205,25 @@ struct GPUHistMakerDevice {
205205

206206
std::unique_ptr<FeatureGroups> feature_groups;
207207

208-
209-
GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
210-
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
208+
GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
209+
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
211210
TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
212211
BatchParam _batch_param)
213212
: evaluator_{_param, n_features, ctx->gpu_id},
214213
ctx_(ctx),
215-
page(_page),
216214
feature_types{_feature_types},
217215
param(std::move(_param)),
218216
column_sampler(column_sampler_seed),
219217
interaction_constraints(param, n_features),
220218
batch_param(std::move(_batch_param)) {
221-
sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
222-
param.sampling_method));
219+
sampler.reset(new GradientBasedSampler(ctx, is_external_memory, _n_rows, batch_param,
220+
param.subsample, param.sampling_method));
223221
if (!param.monotone_constraints.empty()) {
224222
// Copy assigning an empty vector causes an exception in MSVC debug builds
225223
monotone_constraints = param.monotone_constraints;
226224
}
227225

228-
// Init histogram
229-
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
230226
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
231-
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
232-
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
233-
sizeof(GradientSumT)));
234227
}
235228

236229
~GPUHistMakerDevice() { // NOLINT
@@ -247,9 +240,6 @@ struct GPUHistMakerDevice {
247240
param.colsample_bytree);
248241
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
249242

250-
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
251-
ctx_->gpu_id);
252-
253243
this->interaction_constraints.Reset();
254244

255245
if (d_gpair.size() != dh_gpair->Size()) {
@@ -262,11 +252,22 @@ struct GPUHistMakerDevice {
262252
page = sample.page;
263253
gpair = sample.gpair;
264254

255+
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
256+
265257
quantiser.reset(new GradientQuantiser(this->gpair));
266258

267259
row_partitioner.reset(); // Release the device memory first before reallocating
268260
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
261+
262+
// Init histogram
263+
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
269264
hist.Reset();
265+
266+
if (!feature_groups) {
267+
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
268+
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
269+
sizeof(GradientSumT)));
270+
}
270271
}
271272

272273
GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
@@ -809,12 +810,11 @@ class GPUHistMaker : public TreeUpdater {
809810
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
810811

811812
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
812-
auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
813813
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
814814
info_->feature_types.SetDevice(ctx_->gpu_id);
815815
maker.reset(new GPUHistMakerDevice<GradientSumT>(
816-
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
817-
column_sampling_seed, info_->num_col_, batch_param));
816+
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
817+
*param, column_sampling_seed, info_->num_col_, batch_param));
818818

819819
p_last_fmat_ = dmat;
820820
initialised_ = true;

0 commit comments

Comments (0)