Skip to content

Commit 9c0a636

Browse files
committed
Removed extra state from histogram computation.
1 parent f152486 commit 9c0a636

File tree

5 files changed

+34
-82
lines changed

5 files changed

+34
-82
lines changed

src/tree/gpu_hist/histogram.cu

Lines changed: 29 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -146,64 +146,40 @@ __global__ void SharedMemHistKernel(EllpackDeviceAccessor matrix,
146146
}
147147

148148
template <typename GradientSumT>
149-
HistogramLaunchConfig InitGradientHistogram(int device_idx, int n_bins) {
150-
// opt into maximum shared memory for the kernel
151-
int max_shared_memory = dh::MaxSharedMemoryOptin(device_idx);
149+
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
150+
common::Span<GradientPair const> gpair,
151+
common::Span<const uint32_t> d_ridx,
152+
common::Span<GradientSumT> histogram,
153+
GradientSumT rounding) {
154+
// decide whether to use shared memory
155+
int device = 0;
156+
dh::safe_cuda(cudaGetDevice(&device));
157+
int max_shared_memory = dh::MaxSharedMemoryOptin(device);
158+
size_t smem_size = sizeof(GradientSumT) * matrix.NumBins();
159+
bool shared = smem_size <= max_shared_memory;
160+
smem_size = shared ? smem_size : 0;
161+
162+
// opt into maximum shared memory for the kernel if necessary
152163
auto kernel = SharedMemHistKernel<GradientSumT>;
153-
dh::safe_cuda(cudaFuncSetAttribute
154-
(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
155-
max_shared_memory));
156-
157-
// find the optimal configuration for the specified bin count
158-
HistogramLaunchConfig config;
159-
config.shared = n_bins * sizeof(GradientSumT) <= max_shared_memory;
160-
config.block_threads = 256;
161-
int smem_size = config.shared ? n_bins * sizeof(GradientSumT) : 0;
162-
163-
if (config.shared) {
164-
// find the optimal number of threads
165-
int max_threads_per_mp = 0;
166-
dh::safe_cuda(cudaDeviceGetAttribute
167-
(&max_threads_per_mp,
168-
cudaDevAttrMaxThreadsPerMultiProcessor, device_idx));
169-
int warp_size = 32;
170-
int max_kernel_threads_per_mp = 0;
171-
for (int block_threads = 128; block_threads <= max_threads_per_mp;
172-
block_threads += warp_size) {
173-
int n_kernel_blocks_per_mp = 0;
174-
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor
175-
(&n_kernel_blocks_per_mp, kernel, block_threads, smem_size));
176-
if (n_kernel_blocks_per_mp * block_threads > max_kernel_threads_per_mp) {
177-
config.block_threads = unsigned(block_threads);
178-
max_kernel_threads_per_mp = n_kernel_blocks_per_mp * block_threads;
179-
}
180-
}
164+
if (shared) {
165+
dh::safe_cuda(cudaFuncSetAttribute
166+
(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
167+
max_shared_memory));
181168
}
182-
169+
170+
// determine the launch configuration
171+
unsigned block_threads = shared ? 1024 : 256;
183172
int n_mps = 0;
184-
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device_idx));
173+
dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device));
185174
int n_blocks_per_mp = 0;
186175
dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor
187-
(&n_blocks_per_mp, kernel, config.block_threads, smem_size));
188-
config.grid_size = n_blocks_per_mp * n_mps;
189-
190-
return config;
191-
}
192-
193-
template <typename GradientSumT>
194-
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
195-
common::Span<GradientPair const> gpair,
196-
common::Span<const uint32_t> d_ridx,
197-
common::Span<GradientSumT> histogram,
198-
GradientSumT rounding, const HistogramLaunchConfig& config) {
199-
const size_t smem_size =
200-
config.shared ? sizeof(GradientSumT) * matrix.NumBins() : 0;
176+
(&n_blocks_per_mp, kernel, block_threads, smem_size));
177+
unsigned grid_size = n_blocks_per_mp * n_mps;
178+
201179
auto n_elements = d_ridx.size() * matrix.row_stride;
202-
203-
auto kernel = SharedMemHistKernel<GradientSumT>;
204-
dh::LaunchKernel {config.grid_size, config.block_threads, smem_size} (
180+
dh::LaunchKernel {grid_size, block_threads, smem_size} (
205181
kernel, matrix, d_ridx, histogram.data(), gpair.data(), n_elements,
206-
rounding, config.shared);
182+
rounding, shared);
207183
dh::safe_cuda(cudaGetLastError());
208184
}
209185

@@ -212,20 +188,14 @@ template void BuildGradientHistogram<GradientPair>(
212188
common::Span<GradientPair const> gpair,
213189
common::Span<const uint32_t> ridx,
214190
common::Span<GradientPair> histogram,
215-
GradientPair rounding, const HistogramLaunchConfig& config);
191+
GradientPair rounding);
216192

217193
template void BuildGradientHistogram<GradientPairPrecise>(
218194
EllpackDeviceAccessor const& matrix,
219195
common::Span<GradientPair const> gpair,
220196
common::Span<const uint32_t> ridx,
221197
common::Span<GradientPairPrecise> histogram,
222-
GradientPairPrecise rounding, const HistogramLaunchConfig& config);
223-
224-
template HistogramLaunchConfig InitGradientHistogram<GradientPair>
225-
(int device_idx, int n_bins);
226-
227-
template HistogramLaunchConfig InitGradientHistogram<GradientPairPrecise>
228-
(int device_idx, int n_bins);
198+
GradientPairPrecise rounding);
229199

230200
} // namespace tree
231201
} // namespace xgboost

src/tree/gpu_hist/histogram.cuh

Lines changed: 1 addition & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,25 +17,12 @@ DEV_INLINE T TruncateWithRoundingFactor(T const rounding_factor, float const x)
1717
return (rounding_factor + static_cast<T>(x)) - rounding_factor;
1818
}
1919

20-
struct HistogramLaunchConfig {
21-
unsigned grid_size;
22-
unsigned block_threads;
23-
bool shared;
24-
HistogramLaunchConfig(unsigned grid_size, unsigned block_threads, bool shared) :
25-
grid_size(grid_size), block_threads(block_threads), shared(shared) {}
26-
HistogramLaunchConfig() : grid_size(80), block_threads(256), shared(false) {}
27-
};
28-
29-
template <typename GradientSumT>
30-
HistogramLaunchConfig InitGradientHistogram(int device_idx, int n_bins);
31-
3220
template <typename GradientSumT>
3321
void BuildGradientHistogram(EllpackDeviceAccessor const& matrix,
3422
common::Span<GradientPair const> gpair,
3523
common::Span<const uint32_t> ridx,
3624
common::Span<GradientSumT> histogram,
37-
GradientSumT rounding,
38-
const HistogramLaunchConfig& config);
25+
GradientSumT rounding);
3926
} // namespace tree
4027
} // namespace xgboost
4128

src/tree/updater_gpu_hist.cu

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -427,7 +427,6 @@ struct GPUHistMakerDevice {
427427
TrainParam param;
428428
bool deterministic_histogram;
429429
bool prediction_cache_initialised;
430-
HistogramLaunchConfig histogram_config;
431430

432431
GradientSumT histogram_rounding;
433432

@@ -619,7 +618,7 @@ struct GPUHistMakerDevice {
619618
auto d_node_hist = hist.GetNodeHistogram(nidx);
620619
auto d_ridx = row_partitioner->GetRows(nidx);
621620
BuildGradientHistogram(page->GetDeviceAccessor(device_id), gpair, d_ridx, d_node_hist,
622-
histogram_rounding, histogram_config);
621+
histogram_rounding);
623622
}
624623

625624
void SubtractionTrick(int nidx_parent, int nidx_histogram,
@@ -946,7 +945,6 @@ inline void GPUHistMakerDevice<GradientSumT>::InitHistogram() {
946945

947946
// Init histogram
948947
hist.Init(device_id, n_bins);
949-
histogram_config = InitGradientHistogram<GradientSumT>(device_id, n_bins);
950948
}
951949

952950
template <typename GradientSumT>

tests/cpp/tree/gpu_hist/test_histogram.cu

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,17 @@ void TestDeterminsticHistogram() {
2525
auto gpair = GenerateRandomGradients(kRows, kLower, kUpper);
2626
gpair.SetDevice(0);
2727

28-
HistogramLaunchConfig histogram_config(80, 256, true);
29-
3028
auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
3129
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
32-
d_histogram, rounding, histogram_config);
30+
d_histogram, rounding);
3331

3432
for (size_t i = 0; i < kRounds; ++i) {
3533
dh::device_vector<Gradient> new_histogram(kBins * kCols);
3634
auto d_histogram = dh::ToSpan(new_histogram);
3735

3836
auto rounding = CreateRoundingFactor<Gradient>(gpair.DeviceSpan());
3937
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
40-
d_histogram, rounding, histogram_config);
38+
d_histogram, rounding);
4139

4240
for (size_t j = 0; j < new_histogram.size(); ++j) {
4341
ASSERT_EQ(((Gradient)new_histogram[j]).GetGrad(),
@@ -52,7 +50,7 @@ void TestDeterminsticHistogram() {
5250
gpair.SetDevice(0);
5351
dh::device_vector<Gradient> baseline(kBins * kCols);
5452
BuildGradientHistogram(page->GetDeviceAccessor(0), gpair.DeviceSpan(), ridx,
55-
dh::ToSpan(baseline), rounding, histogram_config);
53+
dh::ToSpan(baseline), rounding);
5654
for (size_t i = 0; i < baseline.size(); ++i) {
5755
EXPECT_NEAR(((Gradient)baseline[i]).GetGrad(), ((Gradient)histogram[i]).GetGrad(),
5856
((Gradient)baseline[i]).GetGrad() * 1e-3);

tests/cpp/tree/test_gpu_hist.cu

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,6 @@ void TestBuildHist(bool use_shared_memory_histograms) {
105105
maker.hist.AllocateHistogram(0);
106106
maker.gpair = gpair.DeviceSpan();
107107

108-
maker.histogram_config = HistogramLaunchConfig(80, 256, use_shared_memory_histograms);
109108
maker.BuildHist(0);
110109
DeviceHistogram<GradientSumT> d_hist = maker.hist;
111110

0 commit comments

Comments (0)