@@ -146,64 +146,40 @@ __global__ void SharedMemHistKernel(EllpackDeviceAccessor matrix,
 }
 
 template <typename GradientSumT>
-HistogramLaunchConfig InitGradientHistogram(int device_idx, int n_bins) {
-  // opt into maximum shared memory for the kernel
-  int max_shared_memory = dh::MaxSharedMemoryOptin(device_idx);
+void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
+                            common::Span<GradientPair const> gpair,
+                            common::Span<const uint32_t> d_ridx,
+                            common::Span<GradientSumT> histogram,
+                            GradientSumT rounding) {
+  // decide whether to use shared memory
+  int device = 0;
+  dh::safe_cuda(cudaGetDevice(&device));
+  int max_shared_memory = dh::MaxSharedMemoryOptin(device);
+  size_t smem_size = sizeof(GradientSumT) * matrix.NumBins();
+  bool shared = smem_size <= max_shared_memory;
+  smem_size = shared ? smem_size : 0;
+
+  // opt into maximum shared memory for the kernel if necessary
   auto kernel = SharedMemHistKernel<GradientSumT>;
-  dh::safe_cuda(cudaFuncSetAttribute
-                (kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
-                 max_shared_memory));
-
-  // find the optimal configuration for the specified bin count
-  HistogramLaunchConfig config;
-  config.shared = n_bins * sizeof(GradientSumT) <= max_shared_memory;
-  config.block_threads = 256;
-  int smem_size = config.shared ? n_bins * sizeof(GradientSumT) : 0;
-
-  if (config.shared) {
-    // find the optimal number of threads
-    int max_threads_per_mp = 0;
-    dh::safe_cuda(cudaDeviceGetAttribute
-                  (&max_threads_per_mp,
-                   cudaDevAttrMaxThreadsPerMultiProcessor, device_idx));
-    int warp_size = 32;
-    int max_kernel_threads_per_mp = 0;
-    for (int block_threads = 128; block_threads <= max_threads_per_mp;
-         block_threads += warp_size) {
-      int n_kernel_blocks_per_mp = 0;
-      dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor
-                    (&n_kernel_blocks_per_mp, kernel, block_threads, smem_size));
-      if (n_kernel_blocks_per_mp * block_threads > max_kernel_threads_per_mp) {
-        config.block_threads = unsigned(block_threads);
-        max_kernel_threads_per_mp = n_kernel_blocks_per_mp * block_threads;
-      }
-    }
+  if (shared) {
+    dh::safe_cuda(cudaFuncSetAttribute
+                  (kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
+                   max_shared_memory));
   }
-
+
+  // determine the launch configuration
+  unsigned block_threads = shared ? 1024 : 256;
   int n_mps = 0;
-  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device_idx));
+  dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
   int n_blocks_per_mp = 0;
   dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor
-                (&n_blocks_per_mp, kernel, config.block_threads, smem_size));
-  config.grid_size = n_blocks_per_mp * n_mps;
-
-  return config;
-}
-
-template <typename GradientSumT>
-void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
-                            common::Span<GradientPair const> gpair,
-                            common::Span<const uint32_t> d_ridx,
-                            common::Span<GradientSumT> histogram,
-                            GradientSumT rounding, const HistogramLaunchConfig& config) {
-  const size_t smem_size =
-      config.shared ? sizeof(GradientSumT) * matrix.NumBins() : 0;
+                (&n_blocks_per_mp, kernel, block_threads, smem_size));
+  unsigned grid_size = n_blocks_per_mp * n_mps;
+
   auto n_elements = d_ridx.size() * matrix.row_stride;
-
-  auto kernel = SharedMemHistKernel<GradientSumT>;
-  dh::LaunchKernel {config.grid_size, config.block_threads, smem_size} (
+  dh::LaunchKernel {grid_size, block_threads, smem_size} (
       kernel, matrix, d_ridx, histogram.data(), gpair.data(), n_elements,
-      rounding, config.shared);
+      rounding, shared);
   dh::safe_cuda(cudaGetLastError());
 }
 
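(Illustration, not part of the patch.) For callers, the effect of this hunk is that the separate configuration step disappears. A minimal before/after sketch, where matrix, gpair, ridx, histogram, and rounding stand for device data the updater already holds:

// Before: query a launch configuration once, then pass it on every call.
auto config = InitGradientHistogram<GradientPairPrecise>(device_idx, n_bins);
BuildGradientHistogram(matrix, gpair, ridx, histogram, rounding, config);

// After: the builder derives shared-memory use and launch shape internally.
BuildGradientHistogram(matrix, gpair, ridx, histogram, rounding);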
@@ -212,20 +188,14 @@ template void BuildGradientHistogram<GradientPair>(
     common::Span<GradientPair const> gpair,
     common::Span<const uint32_t> ridx,
     common::Span<GradientPair> histogram,
-    GradientPair rounding, const HistogramLaunchConfig& config);
+    GradientPair rounding);
 
 template void BuildGradientHistogram<GradientPairPrecise>(
     EllpackDeviceAccessor const& matrix,
     common::Span<GradientPair const> gpair,
     common::Span<const uint32_t> ridx,
     common::Span<GradientPairPrecise> histogram,
-    GradientPairPrecise rounding, const HistogramLaunchConfig& config);
-
-template HistogramLaunchConfig InitGradientHistogram<GradientPair>
-(int device_idx, int n_bins);
-
-template HistogramLaunchConfig InitGradientHistogram<GradientPairPrecise>
-(int device_idx, int n_bins);
+    GradientPairPrecise rounding);
 
 }  // namespace tree
 }  // namespace xgboost
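(Illustration, not part of the patch.) The launch-configuration pattern the new BuildGradientHistogram relies on (opt the kernel into large dynamic shared memory when the histogram fits, then size the grid from occupancy) can be tried standalone. Below is a minimal sketch under these assumptions: DummyHistKernel and the bin count are placeholders, dh::MaxSharedMemoryOptin is approximated by cudaDevAttrMaxSharedMemoryPerBlockOptin, and dh::LaunchKernel by a plain triple-chevron launch.

#include <cstdio>
#include <cuda_runtime.h>

__global__ void DummyHistKernel(float* bins, int n_bins) {
  extern __shared__ float smem[];  // placeholder; a real kernel accumulates here
  (void)smem;
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n_bins;
       i += gridDim.x * blockDim.x) {
    bins[i] += 1.0f;  // stand-in for gradient accumulation
  }
}

int main() {
  int device = 0;
  cudaGetDevice(&device);

  // analogue of dh::MaxSharedMemoryOptin: opt-in shared memory limit per block
  int max_shared_memory = 0;
  cudaDeviceGetAttribute(&max_shared_memory,
                         cudaDevAttrMaxSharedMemoryPerBlockOptin, device);

  // decide whether the whole histogram fits in shared memory
  int n_bins = 4096;  // placeholder bin count
  size_t smem_size = sizeof(float) * n_bins;
  bool shared = smem_size <= static_cast<size_t>(max_shared_memory);
  smem_size = shared ? smem_size : 0;

  // opt into maximum shared memory for the kernel if necessary
  auto kernel = DummyHistKernel;
  if (shared) {
    cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                         max_shared_memory);
  }

  // determine the launch configuration from occupancy
  unsigned block_threads = shared ? 1024 : 256;
  int n_mps = 0;
  cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device);
  int n_blocks_per_mp = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel,
                                                block_threads, smem_size);
  unsigned grid_size = n_blocks_per_mp * n_mps;

  float* bins = nullptr;
  cudaMalloc(&bins, sizeof(float) * n_bins);
  cudaMemset(bins, 0, sizeof(float) * n_bins);
  kernel<<<grid_size, block_threads, smem_size>>>(bins, n_bins);
  cudaDeviceSynchronize();
  printf("shared=%d grid=%u block=%u smem=%zu\n",
         static_cast<int>(shared), grid_size, block_threads, smem_size);
  cudaFree(bins);
  return 0;
}

With 4096 float bins (16 KB) the shared path is taken on any recent GPU; raising n_bins until smem_size exceeds the opt-in limit exercises the 256-thread global-memory fallback, mirroring the 1024-vs-256 choice in the hunk above.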