Move thread local entry into Learner.

trivialfis · trivialfis · commit 374bd9acac07 · 2020-03-06T14:34:48.000+08:00
Extracted from dmlc#5389 . This is an attempt to workaround CUDA context issue in static variable, where the CUDA context can be released before device vector. * Add PredictionEntry to thread local entry. This eliminates one copy of prediction vector. * Don't define CUDA C API in a namespace.
diff --git a/include/xgboost/learner.h b/include/xgboost/learner.h
@@ -11,6 +11,7 @@
 #include <rabit/rabit.h>
 #include <xgboost/base.h>
 #include <xgboost/feature_map.h>
+#include <xgboost/predictor.h>
 #include <xgboost/generic_parameters.h>
 #include <xgboost/host_device_vector.h>
 #include <xgboost/model.h>
@@ -29,6 +30,22 @@ class ObjFunction;
 class DMatrix;
 class Json;
 
+/*! \brief entry to to easily hold returning information */
+struct XGBAPIThreadLocalEntry {
+  /*! \brief result holder for returning string */
+  std::string ret_str;
+  /*! \brief result holder for returning strings */
+  std::vector<std::string> ret_vec_str;
+  /*! \brief result holder for returning string pointers */
+  std::vector<const char *> ret_vec_charp;
+  /*! \brief returning float vector. */
+  std::vector<bst_float> ret_vec_float;
+  /*! \brief temp variable of gradient pairs. */
+  std::vector<GradientPair> tmp_gpair;
+  PredictionCacheEntry prediction_entry;
+};
+
+
 /*!
  * \brief Learner class that does training and prediction.
  *  This is the user facing module of xgboost training.
@@ -167,6 +184,8 @@ class Learner : public Model, public Configurable, public rabit::Serializable {
   virtual std::vector<std::string> DumpModel(const FeatureMap& fmap,
                                              bool with_stats,
                                              std::string format) const = 0;
+
+  virtual XGBAPIThreadLocalEntry& GetThreadLocal() const = 0;
   /*!
    * \brief Create a new instance of learner.
    * \param cache_data The matrix to cache the prediction.
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
@@ -1,5 +1,4 @@
 // Copyright (c) 2014-2020 by Contributors
-#include <dmlc/thread_local.h>
 #include <rabit/rabit.h>
 #include <rabit/c_api.h>
 
@@ -26,20 +25,6 @@
 
 using namespace xgboost; // NOLINT(*);
 
-/*! \brief entry to to easily hold returning information */
-struct XGBAPIThreadLocalEntry {
-  /*! \brief result holder for returning string */
-  std::string ret_str;
-  /*! \brief result holder for returning strings */
-  std::vector<std::string> ret_vec_str;
-  /*! \brief result holder for returning string pointers */
-  std::vector<const char *> ret_vec_charp;
-  /*! \brief returning float vector. */
-  std::vector<bst_float> ret_vec_float;
-  /*! \brief temp variable of gradient pairs. */
-  std::vector<GradientPair> tmp_gpair;
-};
-
 XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
   if (major) {
     *major = XGBOOST_VER_MAJOR;
@@ -52,9 +37,6 @@ XGB_DLL void XGBoostVersion(int* major, int* minor, int* patch) {
   }
 }
 
-// define the threadlocal store.
-using XGBAPIThreadLocalStore = dmlc::ThreadLocalStore<XGBAPIThreadLocalEntry>;
-
 int XGBRegisterLogCallback(void (*callback)(const char*)) {
   API_BEGIN();
   LogCallbackRegistry* registry = LogCallbackRegistryStore::Get();
@@ -102,16 +84,16 @@ XGB_DLL int XGDMatrixCreateFromArrayInterfaceColumns(char const* c_json_strs,
                                                      int nthread,
                                                      DMatrixHandle* out) {
   API_BEGIN();
-  LOG(FATAL) << "Xgboost not compiled with cuda";
+  LOG(FATAL) << "XGBoost not compiled with CUDA";
   API_END();
 }
 
 XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
-                                                     bst_float missing,
-                                                     int nthread,
-                                                     DMatrixHandle* out) {
+                                              bst_float missing,
+                                              int nthread,
+                                              DMatrixHandle* out) {
   API_BEGIN();
-  LOG(FATAL) << "Xgboost not compiled with cuda";
+  LOG(FATAL) << "XGBoost not compiled with CUDA";
   API_END();
 }
 
@@ -375,7 +357,7 @@ XGB_DLL int XGBoosterSaveJsonConfig(BoosterHandle handle,
   auto* learner = static_cast<Learner*>(handle);
   learner->Configure();
   learner->SaveConfig(&config);
-  std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
+  std::string& raw_str = learner->GetThreadLocal().ret_str;
   Json::Dump(config, &raw_str);
   *out_str = raw_str.c_str();
   *out_len = static_cast<xgboost::bst_ulong>(raw_str.length());
@@ -422,10 +404,11 @@ XGB_DLL int XGBoosterEvalOneIter(BoosterHandle handle,
                                  const char* evnames[],
                                  xgboost::bst_ulong len,
                                  const char** out_str) {
-  std::string& eval_str = XGBAPIThreadLocalStore::Get()->ret_str;
   API_BEGIN();
   CHECK_HANDLE();
   auto* bst = static_cast<Learner*>(handle);
+  std::string& eval_str = bst->GetThreadLocal().ret_str;
+
   std::vector<std::shared_ptr<DMatrix>> data_sets;
   std::vector<std::string> data_names;
 
@@ -446,24 +429,22 @@ XGB_DLL int XGBoosterPredict(BoosterHandle handle,
                              int32_t training,
                              xgboost::bst_ulong *len,
                              const bst_float **out_result) {
-  std::vector<bst_float>& preds =
-      XGBAPIThreadLocalStore::Get()->ret_vec_float;
   API_BEGIN();
   CHECK_HANDLE();
-  auto *bst = static_cast<Learner*>(handle);
+  auto *learner = static_cast<Learner*>(handle);
+  auto& entry = learner->GetThreadLocal().prediction_entry;
   HostDeviceVector<bst_float> tmp_preds;
-  bst->Predict(
+  learner->Predict(
       *static_cast<std::shared_ptr<DMatrix>*>(dmat),
       (option_mask & 1) != 0,
-      &tmp_preds, ntree_limit,
+      &entry.predictions, ntree_limit,
       static_cast<bool>(training),
       (option_mask & 2) != 0,
       (option_mask & 4) != 0,
       (option_mask & 8) != 0,
       (option_mask & 16) != 0);
-  preds = tmp_preds.HostVector();
-  *out_result = dmlc::BeginPtr(preds);
-  *len = static_cast<xgboost::bst_ulong>(preds.size());
+  *out_result = dmlc::BeginPtr(entry.predictions.ConstHostVector());
+  *len = static_cast<xgboost::bst_ulong>(entry.predictions.Size());
   API_END();
 }
 
@@ -515,13 +496,14 @@ XGB_DLL int XGBoosterLoadModelFromBuffer(BoosterHandle handle,
 XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
                                  xgboost::bst_ulong* out_len,
                                  const char** out_dptr) {
-  std::string& raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
-  raw_str.resize(0);
-
   API_BEGIN();
   CHECK_HANDLE();
-  common::MemoryBufferStream fo(&raw_str);
   auto *learner = static_cast<Learner*>(handle);
+  std::string& raw_str = learner->GetThreadLocal().ret_str;
+  raw_str.resize(0);
+
+  common::MemoryBufferStream fo(&raw_str);
+
   learner->Configure();
   learner->SaveModel(&fo);
   *out_dptr = dmlc::BeginPtr(raw_str);
@@ -534,13 +516,12 @@ XGB_DLL int XGBoosterGetModelRaw(BoosterHandle handle,
 XGB_DLL int XGBoosterSerializeToBuffer(BoosterHandle handle,
                                        xgboost::bst_ulong *out_len,
                                        const char **out_dptr) {
-  std::string &raw_str = XGBAPIThreadLocalStore::Get()->ret_str;
-  raw_str.resize(0);
-
   API_BEGIN();
   CHECK_HANDLE();
-  common::MemoryBufferStream fo(&raw_str);
   auto *learner = static_cast<Learner*>(handle);
+  std::string &raw_str = learner->GetThreadLocal().ret_str;
+  raw_str.resize(0);
+  common::MemoryBufferStream fo(&raw_str);
   learner->Configure();
   learner->Save(&fo);
   *out_dptr = dmlc::BeginPtr(raw_str);
@@ -583,16 +564,13 @@ XGB_DLL int XGBoosterSaveRabitCheckpoint(BoosterHandle handle) {
   API_END();
 }
 
-inline void XGBoostDumpModelImpl(
-    BoosterHandle handle,
-    const FeatureMap& fmap,
-    int with_stats,
-    const char *format,
-    xgboost::bst_ulong* len,
-    const char*** out_models) {
-  std::vector<std::string>& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str;
-  std::vector<const char*>& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp;
+inline void XGBoostDumpModelImpl(BoosterHandle handle, const FeatureMap &fmap,
+                                 int with_stats, const char *format,
+                                 xgboost::bst_ulong *len,
+                                 const char ***out_models) {
   auto *bst = static_cast<Learner*>(handle);
+  std::vector<std::string>& str_vecs = bst->GetThreadLocal().ret_vec_str;
+  std::vector<const char*>& charp_vecs = bst->GetThreadLocal().ret_vec_charp;
   bst->Configure();
   str_vecs = bst->DumpModel(fmap, with_stats != 0, format);
   charp_vecs.resize(str_vecs.size());
@@ -608,7 +586,10 @@ XGB_DLL int XGBoosterDumpModel(BoosterHandle handle,
                                int with_stats,
                                xgboost::bst_ulong* len,
                                const char*** out_models) {
+  API_BEGIN();
+  CHECK_HANDLE();
   return XGBoosterDumpModelEx(handle, fmap, with_stats, "text", len, out_models);
+  API_END();
 }
 
 XGB_DLL int XGBoosterDumpModelEx(BoosterHandle handle,
@@ -664,7 +645,7 @@ XGB_DLL int XGBoosterGetAttr(BoosterHandle handle,
                      const char** out,
                      int* success) {
   auto* bst = static_cast<Learner*>(handle);
-  std::string& ret_str = XGBAPIThreadLocalStore::Get()->ret_str;
+  std::string& ret_str = bst->GetThreadLocal().ret_str;
   API_BEGIN();
   CHECK_HANDLE();
   if (bst->GetAttr(key, &ret_str)) {
@@ -680,9 +661,9 @@ XGB_DLL int XGBoosterGetAttr(BoosterHandle handle,
 XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
                              const char* key,
                              const char* value) {
-  auto* bst = static_cast<Learner*>(handle);
   API_BEGIN();
   CHECK_HANDLE();
+  auto* bst = static_cast<Learner*>(handle);
   if (value == nullptr) {
     bst->DelAttr(key);
   } else {
@@ -694,12 +675,13 @@ XGB_DLL int XGBoosterSetAttr(BoosterHandle handle,
 XGB_DLL int XGBoosterGetAttrNames(BoosterHandle handle,
                                   xgboost::bst_ulong* out_len,
                                   const char*** out) {
-  std::vector<std::string>& str_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_str;
-  std::vector<const char*>& charp_vecs = XGBAPIThreadLocalStore::Get()->ret_vec_charp;
-  auto *bst = static_cast<Learner*>(handle);
   API_BEGIN();
   CHECK_HANDLE();
-  str_vecs = bst->GetAttrNames();
+  auto *learner = static_cast<Learner *>(handle);
+  std::vector<std::string> &str_vecs = learner->GetThreadLocal().ret_vec_str;
+  std::vector<const char *> &charp_vecs =
+      learner->GetThreadLocal().ret_vec_charp;
+  str_vecs = learner->GetAttrNames();
   charp_vecs.resize(str_vecs.size());
   for (size_t i = 0; i < str_vecs.size(); ++i) {
     charp_vecs[i] = str_vecs[i].c_str();
diff --git a/src/c_api/c_api.cu b/src/c_api/c_api.cu
@@ -1,11 +1,12 @@
-// Copyright (c) 2014-2019 by Contributors
-
+// Copyright (c) 2019-2020 by Contributors
 #include "xgboost/data.h"
 #include "xgboost/c_api.h"
+#include "xgboost/learner.h"
 #include "c_api_error.h"
 #include "../data/device_adapter.cuh"
 
-namespace xgboost {
+using namespace xgboost;  // NOLINT
+
 XGB_DLL int XGDMatrixCreateFromArrayInterfaceColumns(char const* c_json_strs,
                                                      bst_float missing,
                                                      int nthread,
@@ -28,5 +29,3 @@ XGB_DLL int XGDMatrixCreateFromArrayInterface(char const* c_json_strs,
       new std::shared_ptr<DMatrix>(DMatrix::Create(&adapter, missing, nthread));
   API_END();
 }
-
-}  // namespace xgboost
diff --git a/src/common/transform.h b/src/common/transform.h
@@ -105,6 +105,17 @@ class Transform {
       return Span<T const> {_vec->ConstHostPointer(),
             static_cast<typename Span<T>::index_type>(_vec->Size())};
     }
+    // Recursive sync host
+    template <typename T>
+    void SyncHost(const HostDeviceVector<T> *_vector) const {
+      _vector->ConstHostPointer();
+    }
+    template <typename Head, typename... Rest>
+    void SyncHost(const HostDeviceVector<Head> *_vector,
+                  const HostDeviceVector<Rest> *... _vectors) const {
+      _vector->ConstHostPointer();
+      SyncHost(_vectors...);
+    }
     // Recursive unpack for Shard.
     template <typename T>
     void UnpackShard(int device, const HostDeviceVector<T> *vector) const {
@@ -154,6 +165,7 @@ class Transform {
     void LaunchCPU(Functor func, HDV*... vectors) const {
       omp_ulong end = static_cast<omp_ulong>(*(range_.end()));
       dmlc::OMPException omp_exc;
+      SyncHost(vectors...);
 #pragma omp parallel for schedule(static)
       for (omp_ulong idx = 0; idx < end; ++idx) {
         omp_exc.Run(func, idx, UnpackHDV(vectors)...);
diff --git a/src/learner.cc b/src/learner.cc
@@ -205,6 +205,9 @@ class LearnerImpl : public Learner {
       cache_.Cache(d, GenericParameter::kCpuId);
     }
   }
+  ~LearnerImpl() override {
+    local_map.erase(this);
+  }
   // Configuration before data is known.
   void Configure() override {
     if (!this->need_configuration_) { return; }
@@ -873,6 +876,9 @@ class LearnerImpl : public Learner {
     }
   }
 
+  XGBAPIThreadLocalEntry& GetThreadLocal() const override {
+    return local_map[this];
+  }
   const std::map<std::string, std::string>& GetConfigurationArguments() const override {
     return cfg_;
   }
@@ -1017,6 +1023,7 @@ class LearnerImpl : public Learner {
   // gradient pairs
   HostDeviceVector<GradientPair> gpair_;
   bool need_configuration_;
+  static thread_local std::map<LearnerImpl const *, XGBAPIThreadLocalEntry> local_map;
 
  private:
   /*! \brief random number transformation seed. */
@@ -1037,6 +1044,8 @@ std::string const LearnerImpl::kEvalMetric {"eval_metric"};  // NOLINT
 
 constexpr int32_t LearnerImpl::kRandSeedMagic;
 
+thread_local std::map<LearnerImpl const *, XGBAPIThreadLocalEntry> LearnerImpl::local_map;
+
 Learner* Learner::Create(
     const std::vector<std::shared_ptr<DMatrix> >& cache_data) {
   return new LearnerImpl(cache_data);
diff --git a/tests/python-gpu/test_from_columnar.py b/tests/python-gpu/test_from_columnar.py
@@ -99,7 +99,7 @@ def test_cudf_training(self):
         evals_result_cudf = {}
         dtrain_cudf = xgb.DMatrix(df.from_pandas(X), df.from_pandas(y), weight=cudf_weights,
                                   base_margin=cudf_base_margin)
-        params = {'gpu_id': 0, 'nthread': 1}
+        params = {'gpu_id': 0}
         xgb.train(params, dtrain_cudf, evals=[(dtrain_cudf, "train")],
                   evals_result=evals_result_cudf)
         evals_result_np = {}