Skip to content

Commit 4b5d38f

Browse files
committed
GPU initialization.
1 parent 68b838d commit 4b5d38f

File tree

4 files changed

+55
-47
lines changed

4 files changed

+55
-47
lines changed

src/data/sparse_page_source.h

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
#define XGBOOST_DATA_SPARSE_PAGE_SOURCE_H_
77

88
#include <algorithm> // for min
9-
#include <future>
9+
#include <future> // async
1010
#include <map>
1111
#include <memory>
1212
#include <string>
@@ -18,6 +18,7 @@
1818
#include "../common/io.h" // for PrivateMmapStream, PadPageForMMAP
1919
#include "../common/timer.h"
2020
#include "adapter.h"
21+
#include "dmlc/common.h" // OMPException
2122
#include "proxy_dmatrix.h"
2223
#include "sparse_page_writer.h"
2324
#include "xgboost/base.h"
@@ -102,6 +103,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
102103
// A ring storing futures to data. Since the DMatrix iterator is forward only, so we
103104
// can pre-fetch data in a ring.
104105
std::unique_ptr<Ring> ring_{new Ring};
106+
dmlc::OMPException exec_;
105107

106108
bool ReadCache() {
107109
CHECK(!at_end_);
@@ -119,35 +121,41 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S> {
119121
CHECK_GT(n_prefetch_batches, 0) << "total batches:" << n_batches_;
120122
std::size_t fetch_it = count_;
121123

124+
exec_.Rethrow();
125+
122126
for (std::size_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) {
123127
fetch_it %= n_batches_; // ring
124128
if (ring_->at(fetch_it).valid()) {
125129
continue;
126130
}
127-
auto const *self = this; // make sure it's const
131+
auto const* self = this; // make sure it's const
128132
CHECK_LT(fetch_it, cache_info_->offset.size());
129-
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self]() {
133+
ring_->at(fetch_it) = std::async(std::launch::async, [fetch_it, self, this]() {
130134
auto page = std::make_shared<S>();
131-
132-
common::Timer timer;
133-
timer.Start();
134-
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
135-
auto n = self->cache_info_->ShardName();
136-
137-
std::uint64_t offset = self->cache_info_->offset.at(fetch_it);
138-
std::uint64_t length = self->cache_info_->offset.at(fetch_it + 1) - offset;
139-
140-
auto fi = std::make_unique<common::PrivateMmapStream>(n, true, offset, length);
141-
CHECK(fmt->Read(page.get(), fi.get()));
142-
LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
135+
this->exec_.Run([&] {
136+
common::Timer timer;
137+
timer.Start();
138+
std::unique_ptr<SparsePageFormat<S>> fmt{CreatePageFormat<S>("raw")};
139+
auto n = self->cache_info_->ShardName();
140+
141+
std::uint64_t offset = self->cache_info_->offset.at(fetch_it);
142+
std::uint64_t length = self->cache_info_->offset.at(fetch_it + 1) - offset;
143+
144+
auto fi = std::make_unique<common::PrivateMmapStream>(n, true, offset, length);
145+
CHECK(fmt->Read(page.get(), fi.get()));
146+
LOG(INFO) << "Read a page in " << timer.ElapsedSeconds() << " seconds.";
147+
});
143148
return page;
144149
});
145150
}
151+
146152
CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }),
147153
n_prefetch_batches)
148154
<< "Sparse DMatrix assumes forward iteration.";
149155
page_ = (*ring_)[count_].get();
150156
CHECK(!(*ring_)[count_].valid());
157+
exec_.Rethrow();
158+
151159
return true;
152160
}
153161

src/tree/gpu_hist/gradient_based_sampler.cu

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -154,20 +154,22 @@ GradientBasedSample NoSampling::Sample(Context const* ctx, common::Span<Gradient
154154
return {dmat->Info().num_row_, page, gpair};
155155
}
156156

157-
ExternalMemoryNoSampling::ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page,
158-
size_t n_rows, BatchParam batch_param)
159-
: batch_param_{std::move(batch_param)},
160-
page_(new EllpackPageImpl(ctx->gpu_id, page->Cuts(), page->is_dense, page->row_stride,
161-
n_rows)) {}
157+
ExternalMemoryNoSampling::ExternalMemoryNoSampling(BatchParam batch_param)
158+
: batch_param_{std::move(batch_param)} {}
162159

163160
GradientBasedSample ExternalMemoryNoSampling::Sample(Context const* ctx,
164161
common::Span<GradientPair> gpair,
165162
DMatrix* dmat) {
166163
if (!page_concatenated_) {
167164
// Concatenate all the external memory ELLPACK pages into a single in-memory page.
165+
page_.reset(nullptr);
168166
size_t offset = 0;
169167
for (auto& batch : dmat->GetBatches<EllpackPage>(ctx, batch_param_)) {
170168
auto page = batch.Impl();
169+
if (!page_) {
170+
page_ = std::make_unique<EllpackPageImpl>(ctx->gpu_id, page->Cuts(), page->is_dense,
171+
page->row_stride, dmat->Info().num_row_);
172+
}
171173
size_t num_elements = page_->Copy(ctx->gpu_id, page, offset);
172174
offset += num_elements;
173175
}
@@ -319,13 +321,12 @@ GradientBasedSample ExternalMemoryGradientBasedSampling::Sample(Context const* c
319321
return {sample_rows, page_.get(), dh::ToSpan(gpair_)};
320322
}
321323

322-
GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page,
324+
GradientBasedSampler::GradientBasedSampler(Context const* ctx, bool is_external_memory,
323325
size_t n_rows, const BatchParam& batch_param,
324326
float subsample, int sampling_method) {
325327
monitor_.Init("gradient_based_sampler");
326328

327329
bool is_sampling = subsample < 1.0;
328-
bool is_external_memory = page->n_rows != n_rows;
329330

330331
if (is_sampling) {
331332
switch (sampling_method) {
@@ -338,17 +339,17 @@ GradientBasedSampler::GradientBasedSampler(Context const* ctx, EllpackPageImpl c
338339
break;
339340
case TrainParam::kGradientBased:
340341
if (is_external_memory) {
341-
strategy_.reset(
342-
new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
342+
strategy_.reset(new ExternalMemoryGradientBasedSampling(n_rows, batch_param, subsample));
343343
} else {
344344
strategy_.reset(new GradientBasedSampling(n_rows, batch_param, subsample));
345345
}
346346
break;
347-
default:LOG(FATAL) << "unknown sampling method";
347+
default:
348+
LOG(FATAL) << "unknown sampling method";
348349
}
349350
} else {
350351
if (is_external_memory) {
351-
strategy_.reset(new ExternalMemoryNoSampling(ctx, page, n_rows, batch_param));
352+
strategy_.reset(new ExternalMemoryNoSampling(batch_param));
352353
} else {
353354
strategy_.reset(new NoSampling(batch_param));
354355
}

src/tree/gpu_hist/gradient_based_sampler.cuh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,14 +43,13 @@ class NoSampling : public SamplingStrategy {
4343
/*! \brief No sampling in external memory mode. */
4444
class ExternalMemoryNoSampling : public SamplingStrategy {
4545
public:
46-
ExternalMemoryNoSampling(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
47-
BatchParam batch_param);
46+
explicit ExternalMemoryNoSampling(BatchParam batch_param);
4847
GradientBasedSample Sample(Context const* ctx, common::Span<GradientPair> gpair,
4948
DMatrix* dmat) override;
5049

5150
private:
5251
BatchParam batch_param_;
53-
std::unique_ptr<EllpackPageImpl> page_;
52+
std::unique_ptr<EllpackPageImpl> page_{nullptr};
5453
bool page_concatenated_{false};
5554
};
5655

@@ -123,7 +122,7 @@ class ExternalMemoryGradientBasedSampling : public SamplingStrategy {
123122
*/
124123
class GradientBasedSampler {
125124
public:
126-
GradientBasedSampler(Context const* ctx, EllpackPageImpl const* page, size_t n_rows,
125+
GradientBasedSampler(Context const* ctx, bool is_external_memory, size_t n_rows,
127126
const BatchParam& batch_param, float subsample, int sampling_method);
128127

129128
/*! \brief Sample from a DMatrix based on the given gradient pairs. */

src/tree/updater_gpu_hist.cu

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -205,32 +205,25 @@ struct GPUHistMakerDevice {
205205

206206
std::unique_ptr<FeatureGroups> feature_groups;
207207

208-
209-
GPUHistMakerDevice(Context const* ctx, EllpackPageImpl const* _page,
210-
common::Span<FeatureType const> _feature_types, bst_uint _n_rows,
208+
GPUHistMakerDevice(Context const* ctx, bool is_external_memory,
209+
common::Span<FeatureType const> _feature_types, bst_row_t _n_rows,
211210
TrainParam _param, uint32_t column_sampler_seed, uint32_t n_features,
212211
BatchParam _batch_param)
213212
: evaluator_{_param, n_features, ctx->gpu_id},
214213
ctx_(ctx),
215-
page(_page),
216214
feature_types{_feature_types},
217215
param(std::move(_param)),
218216
column_sampler(column_sampler_seed),
219217
interaction_constraints(param, n_features),
220218
batch_param(std::move(_batch_param)) {
221-
sampler.reset(new GradientBasedSampler(ctx, page, _n_rows, batch_param, param.subsample,
222-
param.sampling_method));
219+
sampler.reset(new GradientBasedSampler(ctx, is_external_memory, _n_rows, batch_param,
220+
param.subsample, param.sampling_method));
223221
if (!param.monotone_constraints.empty()) {
224222
// Copy assigning an empty vector causes an exception in MSVC debug builds
225223
monotone_constraints = param.monotone_constraints;
226224
}
227225

228-
// Init histogram
229-
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
230226
monitor.Init(std::string("GPUHistMakerDevice") + std::to_string(ctx_->gpu_id));
231-
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
232-
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
233-
sizeof(GradientSumT)));
234227
}
235228

236229
~GPUHistMakerDevice() { // NOLINT
@@ -247,9 +240,6 @@ struct GPUHistMakerDevice {
247240
param.colsample_bytree);
248241
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
249242

250-
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param,
251-
ctx_->gpu_id);
252-
253243
this->interaction_constraints.Reset();
254244

255245
if (d_gpair.size() != dh_gpair->Size()) {
@@ -262,11 +252,22 @@ struct GPUHistMakerDevice {
262252
page = sample.page;
263253
gpair = sample.gpair;
264254

255+
this->evaluator_.Reset(page->Cuts(), feature_types, dmat->Info().num_col_, param, ctx_->gpu_id);
256+
265257
quantiser.reset(new GradientQuantiser(this->gpair));
266258

267259
row_partitioner.reset(); // Release the device memory first before reallocating
268260
row_partitioner.reset(new RowPartitioner(ctx_->gpu_id, sample.sample_rows));
261+
262+
// Init histogram
263+
hist.Init(ctx_->gpu_id, page->Cuts().TotalBins());
269264
hist.Reset();
265+
266+
if (!feature_groups) {
267+
feature_groups.reset(new FeatureGroups(page->Cuts(), page->is_dense,
268+
dh::MaxSharedMemoryOptin(ctx_->gpu_id),
269+
sizeof(GradientSumT)));
270+
}
270271
}
271272

272273
GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) {
@@ -809,12 +810,11 @@ class GPUHistMaker : public TreeUpdater {
809810
collective::Broadcast(&column_sampling_seed, sizeof(column_sampling_seed), 0);
810811

811812
auto batch_param = BatchParam{param->max_bin, TrainParam::DftSparseThreshold()};
812-
auto page = (*dmat->GetBatches<EllpackPage>(ctx_, batch_param).begin()).Impl();
813813
dh::safe_cuda(cudaSetDevice(ctx_->gpu_id));
814814
info_->feature_types.SetDevice(ctx_->gpu_id);
815815
maker.reset(new GPUHistMakerDevice<GradientSumT>(
816-
ctx_, page, info_->feature_types.ConstDeviceSpan(), info_->num_row_, *param,
817-
column_sampling_seed, info_->num_col_, batch_param));
816+
ctx_, !dmat->SingleColBlock(), info_->feature_types.ConstDeviceSpan(), info_->num_row_,
817+
*param, column_sampling_seed, info_->num_col_, batch_param));
818818

819819
p_last_fmat_ = dmat;
820820
initialised_ = true;

0 commit comments

Comments (0)