Skip to content

Commit 374bd9a

Browse files
committed
Move thread local entry into Learner.
Extracted from dmlc#5389 . This is an attempt to workaround CUDA context issue in static variable, where the CUDA context can be released before device vector. * Add PredictionEntry to thread local entry. This eliminates one copy of prediction vector. * Don't define CUDA C API in a namespace.
1 parent 8d06878 commit 374bd9a

File tree

6 files changed

+83
-62
lines changed

6 files changed

+83
-62
lines changed

include/xgboost/learner.h

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <rabit/rabit.h>
1212
#include <xgboost/base.h>
1313
#include <xgboost/feature_map.h>
14+
#include <xgboost/predictor.h>
1415
#include <xgboost/generic_parameters.h>
1516
#include <xgboost/host_device_vector.h>
1617
#include <xgboost/model.h>
@@ -29,6 +30,22 @@ class ObjFunction;
2930
class DMatrix;
3031
class Json;
3132

33+
/*! \brief entry to to easily hold returning information */
34+
struct XGBAPIThreadLocalEntry {
35+
/*! \brief result holder for returning string */
36+
std::string ret_str;
37+
/*! \brief result holder for returning strings */
38+
std::vector<std::string> ret_vec_str;
39+
/*! \brief result holder for returning string pointers */
40+
std::vector<const char *> ret_vec_charp;
41+
/*! \brief returning float vector. */
42+
std::vector<bst_float> ret_vec_float;
43+
/*! \brief temp variable of gradient pairs. */
44+
std::vector<GradientPair> tmp_gpair;
45+
PredictionCacheEntry prediction_entry;
46+
};
47+
48+
3249
/*!
3350
* \brief Learner class that does training and prediction.
3451
* This is the user facing module of xgboost training.
@@ -167,6 +184,8 @@ class Learner : public Model, public Configurable, public rabit::Serializable {
167184
virtual std::vector<std::string> DumpModel(const FeatureMap& fmap,
168185
bool with_stats,
169186
std::string format) const = 0;
187+
188+
virtual XGBAPIThreadLocalEntry& GetThreadLocal() const = 0;
170189
/*!
171190
* \brief Create a new instance of learner.
172191
* \param cache_data The matrix to cache the prediction.

src/c_api/c_api.cc

Lines changed: 38 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
// Copyright (c) 2014-2020 by Contributors
2-
#include <dmlc/thread_local.h>
32
#include <rabit/rabit.h>
43
#include <rabit/c_api.h>
54

@@ -26,20 +25,6 @@
2625

2726
using namespace xgboost; // NOLINT(*);
2827

29-
/*! \brief entry to to easily hold returning information */
30-
struct XGBAPIThreadLocalEntry {
31-
/*! \brief result holder for returning string */
32-
std::string ret_str;
33-
/*! \brief result holder for returning strings */
34-
std::vector<std::string> ret_vec_str;
35-
/*! \brief result holder for returning string pointers */
36-
std::vector<const char *> ret_vec_charp;
37-
/*! \brief returning float vector. */
38-
std::vector<bst_float> ret_vec_float;
39-
/*! \brief temp variable of gradient pairs. */
40-
std::vector<GradientPair> tmp_gpair;
41-
};
42-
4328
XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
4429
if (major) {
4530
*major = XGBOOST_VER_MAJOR;
@@ -52,9 +37,6 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
5237
}
5338
}
5439

55-
// define the threadlocal store.
56-
using XGBAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;
57-
5840
int XGBRegisterLogCallback(void (*callback)(const char*)) {
5941
API_BEGIN();
6042
LogCallbackRegistry* registry = LogCallbackRegistryStore::Get();
@@ -102,16 +84,16 @@ XGB_DLL int XGDMatrixCreateFromArrayInterfaceColumns(char const* c_json_strs,
10284
int nthread,
10385
DMatrixHandle* out) {
10486
API_BEGIN();
105-
LOG(FATAL) << "Xgboost not compiled with cuda";
87+
LOG(FATAL) << "XGBoost not compiled with CUDA";
10688
API_END();
10789
}
10890

10991
XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
110-
bst_float missing,
111-
int nthread,
112-
DMatrixHandle* out) {
92+
bst_float missing,
93+
int nthread,
94+
DMatrixHandle* out) {
11395
API_BEGIN();
114-
LOG(FATAL) << "Xgboost not compiled with cuda";
96+
LOG(FATAL) << "XGBoost not compiled with CUDA";
11597
API_END();
11698
}
11799

@@ -375,7 +357,7 @@ XGB_DLL int XGBoosterSaveJsonConfig(BoosterHandle handle,
375357
auto* learner = static_cast<Learner*>(handle);
376358
learner->Configure();
377359
learner->SaveConfig(&config);
378-
std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
360+
std::string& raw_str = learner->GetThreadLocal().ret_str;
379361
Json::Dump(config, &raw_str);
380362
*out_str = raw_str.c_str();
381363
*out_len = static_cast<xgboost::bst_ulong>(raw_str.length());
@@ -422,10 +404,11 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle,
422404
const char* evnames[],
423405
xgboost::bst_ulong len,
424406
const char** out_str) {
425-
std::string& eval_str = XGBAPIThreadLocalStore::Get()->ret_str;
426407
API_BEGIN();
427408
CHECK_HANDLE();
428409
auto* bst = static_cast<Learner*>(handle);
410+
std::string& eval_str = bst->GetThreadLocal().ret_str;
411+
429412
std::vector<std::shared_ptr<DMatrix>> data_sets;
430413
std::vector<std::string> data_names;
431414

@@ -446,24 +429,22 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
446429
int32_t training,
447430
xgboost::bst_ulong *len,
448431
const bst_float **out_result) {
449-
std::vector<bst_float>& preds =
450-
XGBAPIThreadLocalStore::Get()->ret_vec_float;
451432
API_BEGIN();
452433
CHECK_HANDLE();
453-
auto *bst = static_cast<Learner*>(handle);
434+
auto *learner = static_cast<Learner*>(handle);
435+
auto& entry = learner->GetThreadLocal().prediction_entry;
454436
HostDeviceVector<bst_float> tmp_preds;
455-
bst->Predict(
437+
learner->Predict(
456438
*static_cast<std::shared_ptr<DMatrix>*>(dmat),
457439
(option_mask & 1) != 0,
458-
&tmp_preds, ntree_limit,
440+
&entry.predictions, ntree_limit,
459441
static_cast<bool>(training),
460442
(option_mask & 2) != 0,
461443
(option_mask & 4) != 0,
462444
(option_mask & 8) != 0,
463445
(option_mask & 16) != 0);
464-
preds = tmp_preds.HostVector();
465-
*out_result = dmlc::BeginPtr(preds);
466-
*len = static_cast<xgboost::bst_ulong>(preds.size());
446+
*out_result = dmlc::BeginPtr(entry.predictions.ConstHostVector());
447+
*len = static_cast<xgboost::bst_ulong>(entry.predictions.Size());
467448
API_END();
468449
}
469450

@@ -515,13 +496,14 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
515496
XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
516497
xgboost::bst_ulong* out_len,
517498
const char** out_dptr) {
518-
std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
519-
raw_str.resize(0);
520-
521499
API_BEGIN();
522500
CHECK_HANDLE();
523-
common::MemoryBufferStream fo(&raw_str);
524501
auto *learner = static_cast<Learner*>(handle);
502+
std::string& raw_str = learner->GetThreadLocal().ret_str;
503+
raw_str.resize(0);
504+
505+
common::MemoryBufferStream fo(&raw_str);
506+
525507
learner->Configure();
526508
learner->SaveModel(&fo);
527509
*out_dptr = dmlc::BeginPtr(raw_str);
@@ -534,13 +516,12 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
534516
XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle,
535517
xgboost::bst_ulong *out_len,
536518
const char **out_dptr) {
537-
std::string &raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
538-
raw_str.resize(0);
539-
540519
API_BEGIN();
541520
CHECK_HANDLE();
542-
common::MemoryBufferStream fo(&raw_str);
543521
auto *learner = static_cast<Learner*>(handle);
522+
std::string &raw_str = learner->GetThreadLocal().ret_str;
523+
raw_str.resize(0);
524+
common::MemoryBufferStream fo(&raw_str);
544525
learner->Configure();
545526
learner->Save(&fo);
546527
*out_dptr = dmlc::BeginPtr(raw_str);
@@ -583,16 +564,13 @@ XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) {
583564
API_END();
584565
}
585566

586-
inline void XGBoostDumpModelImpl(
587-
BoosterHandle handle,
588-
const FeatureMap& fmap,
589-
int with_stats,
590-
const char *format,
591-
xgboost::bst_ulong* len,
592-
const char*** out_models) {
593-
std::vector<std::string>& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str;
594-
std::vector<const char*>& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp;
567+
inline void XGBoostDumpModelImpl(BoosterHandle handle, const FeatureMap &fmap,
568+
int with_stats, const char *format,
569+
xgboost::bst_ulong *len,
570+
const char ***out_models) {
595571
auto *bst = static_cast<Learner*>(handle);
572+
std::vector<std::string>& str_vecs = bst->GetThreadLocal().ret_vec_str;
573+
std::vector<const char*>& charp_vecs = bst->GetThreadLocal().ret_vec_charp;
596574
bst->Configure();
597575
str_vecs = bst->DumpModel(fmap, with_stats != 0, format);
598576
charp_vecs.resize(str_vecs.size());
@@ -608,7 +586,10 @@ XGB_DLL int XGBoosterDumpModel(BoosterHandle handle,
608586
int with_stats,
609587
xgboost::bst_ulong* len,
610588
const char*** out_models) {
589+
API_BEGIN();
590+
CHECK_HANDLE();
611591
return XGBoosterDumpModelEx(handle, fmap, with_stats, "text", len, out_models);
592+
API_END();
612593
}
613594

614595
XGB_DLL int XGBoosterDumpModelEx(BoosterHandle handle,
@@ -664,7 +645,7 @@ XGB_DLL int XGBoosterGetAttr(BoosterHandle handle,
664645
const char** out,
665646
int* success) {
666647
auto* bst = static_cast<Learner*>(handle);
667-
std::string& ret_str = XGBAPIThreadLocalStore::Get()->ret_str;
648+
std::string& ret_str = bst->GetThreadLocal().ret_str;
668649
API_BEGIN();
669650
CHECK_HANDLE();
670651
if (bst->GetAttr(key, &ret_str)) {
@@ -680,9 +661,9 @@ XGB_DLL int XGBoosterGetAttr(BoosterHandle handle,
680661
XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
681662
const char* key,
682663
const char* value) {
683-
auto* bst = static_cast<Learner*>(handle);
684664
API_BEGIN();
685665
CHECK_HANDLE();
666+
auto* bst = static_cast<Learner*>(handle);
686667
if (value == nullptr) {
687668
bst->DelAttr(key);
688669
} else {
@@ -694,12 +675,13 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
694675
XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
695676
xgboost::bst_ulong* out_len,
696677
const char*** out) {
697-
std::vector<std::string>& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str;
698-
std::vector<const char*>& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp;
699-
auto *bst = static_cast<Learner*>(handle);
700678
API_BEGIN();
701679
CHECK_HANDLE();
702-
str_vecs = bst->GetAttrNames();
680+
auto *learner = static_cast<Learner *>(handle);
681+
std::vector<std::string> &str_vecs = learner->GetThreadLocal().ret_vec_str;
682+
std::vector<const char *> &charp_vecs =
683+
learner->GetThreadLocal().ret_vec_charp;
684+
str_vecs = learner->GetAttrNames();
703685
charp_vecs.resize(str_vecs.size());
704686
for (size_t i = 0; i < str_vecs.size(); ++i) {
705687
charp_vecs[i] = str_vecs[i].c_str();

src/c_api/c_api.cu

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
1-
// Copyright (c) 2014-2019 by Contributors
2-
1+
// Copyright (c) 2019-2020 by Contributors
32
#include "xgboost/data.h"
43
#include "xgboost/c_api.h"
4+
#include "xgboost/learner.h"
55
#include "c_api_error.h"
66
#include "../data/device_adapter.cuh"
77

8-
namespace xgboost {
8+
using namespace xgboost; // NOLINT
9+
910
XGB_DLL int XGDMatrixCreateFromArrayInterfaceColumns(char const* c_json_strs,
1011
bst_float missing,
1112
int nthread,
@@ -28,5 +29,3 @@ XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
2829
new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
2930
API_END();
3031
}
31-
32-
} // namespace xgboost

src/common/transform.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,17 @@ class Transform {
105105
return Span<T const> {_vec->ConstHostPointer(),
106106
static_cast<typename Span<T>::index_type>(_vec->Size())};
107107
}
108+
// Recursive sync host
109+
template <typename T>
110+
void SyncHost(const HostDeviceVector<T> *_vector) const {
111+
_vector->ConstHostPointer();
112+
}
113+
template <typename Head, typename... Rest>
114+
void SyncHost(const HostDeviceVector<Head> *_vector,
115+
const HostDeviceVector<Rest> *... _vectors) const {
116+
_vector->ConstHostPointer();
117+
SyncHost(_vectors...);
118+
}
108119
// Recursive unpack for Shard.
109120
template <typename T>
110121
void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
@@ -154,6 +165,7 @@ class Transform {
154165
void LaunchCPU(Functor func, HDV*... vectors) const {
155166
omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
156167
dmlc::OMPException omp_exc;
168+
SyncHost(vectors...);
157169
#pragma omp parallel for schedule(static)
158170
for (omp_ulong idx = 0; idx < end; ++idx) {
159171
omp_exc.Run(func, idx, UnpackHDV(vectors)...);

src/learner.cc

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,9 @@ class LearnerImpl : public Learner {
205205
cache_.Cache(d, GenericParameter::kCpuId);
206206
}
207207
}
208+
~LearnerImpl() override {
209+
local_map.erase(this);
210+
}
208211
// Configuration before data is known.
209212
void Configure() override {
210213
if (!this->need_configuration_) { return; }
@@ -873,6 +876,9 @@ class LearnerImpl : public Learner {
873876
}
874877
}
875878

879+
XGBAPIThreadLocalEntry& GetThreadLocal() const override {
880+
return local_map[this];
881+
}
876882
const std::map<std::string, std::string>& GetConfigurationArguments() const override {
877883
return cfg_;
878884
}
@@ -1017,6 +1023,7 @@ class LearnerImpl : public Learner {
10171023
// gradient pairs
10181024
HostDeviceVector<GradientPair> gpair_;
10191025
bool need_configuration_;
1026+
static thread_local std::map<LearnerImpl const *, XGBAPIThreadLocalEntry> local_map;
10201027

10211028
private:
10221029
/*! \brief random number transformation seed. */
@@ -1037,6 +1044,8 @@ std::string const LearnerImpl::kEvalMetric {"eval_metric"}; // NOLINT
10371044

10381045
constexpr int32_t LearnerImpl::kRandSeedMagic;
10391046

1047+
thread_local std::map<LearnerImpl const *, XGBAPIThreadLocalEntry> LearnerImpl::local_map;
1048+
10401049
Learner* Learner::Create(
10411050
const std::vector<std::shared_ptr<DMatrix> >& cache_data) {
10421051
return new LearnerImpl(cache_data);

tests/python-gpu/test_from_columnar.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def test_cudf_training(self):
9999
evals_result_cudf = {}
100100
dtrain_cudf = xgb.DMatrix(df.from_pandas(X), df.from_pandas(y), weight=cudf_weights,
101101
base_margin=cudf_base_margin)
102-
params = {'gpu_id': 0, 'nthread': 1}
102+
params = {'gpu_id': 0}
103103
xgb.train(params, dtrain_cudf, evals=[(dtrain_cudf, "train")],
104104
evals_result=evals_result_cudf)
105105
evals_result_np = {}

0 commit comments

Comments
 (0)