Skip to content

Commit 111aed8

Browse files
author
SHVETS, KIRILL
committed
fix failed tests
1 parent 70a9bc4 commit 111aed8

File tree

7 files changed

+303
-223
lines changed

7 files changed

+303
-223
lines changed

src/common/column_matrix.h

Lines changed: 75 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ class ColumnMatrix {
8585
for (int32_t fid = 0; fid < nfeature; ++fid) {
8686
CHECK_LE(gmat.cut.Ptrs()[fid + 1] - gmat.cut.Ptrs()[fid], max_val);
8787
}
88-
bool all_dense = true;
88+
bool all_dense = gmat.IsDense();
8989
gmat.GetFeatureCounts(&feature_counts_[0]);
9090
// classify features
9191
for (int32_t fid = 0; fid < nfeature; ++fid) {
@@ -120,8 +120,9 @@ class ColumnMatrix {
120120
index_.resize(boundary_[nfeature - 1].index_end);
121121
type_size_ = 1 << gmat.index.getBinBound();
122122
index_.resize(boundary_[nfeature - 1].index_end * type_size_);
123-
if(!all_dense)
123+
if (!all_dense) {
124124
row_ind_.resize(boundary_[nfeature - 1].row_ind_end);
125+
}
125126

126127
// store least bin id for each feature
127128
index_base_.resize(nfeature);
@@ -148,10 +149,8 @@ class ColumnMatrix {
148149
num_nonzeros.resize(nfeature);
149150
std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
150151

151-
if(all_dense)
152-
{
153-
switch(gmat.index.getBinBound())
154-
{
152+
if (all_dense) {
153+
switch (gmat.index.getBinBound()) {
155154
case POWER_OF_TWO_8:
156155
SetIndexAllDense(gmat.index.data<uint8_t>(), gmat, nrow);
157156
break;
@@ -162,42 +161,36 @@ class ColumnMatrix {
162161
SetIndexAllDense(gmat.index.data<uint32_t>(), gmat, nrow);
163162
break;
164163
}
165-
}
166-
167-
else
168-
{
169-
switch(gmat.index.getBinBound())
170-
{
164+
} else {
165+
switch (gmat.index.getBinBound()) {
171166
case POWER_OF_TWO_8:
172-
SetIndex(gmat.index.data<uint8_t>(), gmat, nrow, nfeature);
167+
SetIndex(gmat.index.data<uint8_t>(), gmat.index.disp(), gmat, nrow, nfeature);
173168
break;
174169
case POWER_OF_TWO_16:
175-
SetIndex(gmat.index.data<uint16_t>(), gmat, nrow, nfeature);
170+
SetIndex(gmat.index.data<uint16_t>(), gmat.index.disp(), gmat, nrow, nfeature);
176171
break;
177172
case POWER_OF_TWO_32:
178-
SetIndex(gmat.index.data<uint32_t>(), gmat, nrow, nfeature);
173+
SetIndex(gmat.index.data<uint32_t>(), gmat.index.disp(), gmat, nrow, nfeature);
179174
break;
180175
}
181176
}
182-
183-
184177
}
185178

186179
/* Fetch an individual column. This code should be used with XGBOOST_TYPE_SWITCH
187180
to determine type of bin id's */
188181
template <typename T>
189182
inline Column<T> GetColumn(unsigned fid) const {
190-
Column<T> c(type_[fid], (T*)(&index_[boundary_[fid].index_begin * type_size_]), index_base_[fid],
191-
(type_[fid] == ColumnType::kSparseColumn ?
192-
&row_ind_[boundary_[fid].row_ind_begin] : nullptr),
193-
boundary_[fid].index_end - boundary_[fid].index_begin);
183+
Column<T> c(type_[fid],
184+
reinterpret_cast<const T*>(&index_[boundary_[fid].index_begin * type_size_]),
185+
index_base_[fid], (type_[fid] == ColumnType::kSparseColumn ?
186+
&row_ind_[boundary_[fid].row_ind_begin] : nullptr),
187+
boundary_[fid].index_end - boundary_[fid].index_begin);
194188
return c;
195189
}
196190

197191
template<typename T>
198-
inline void SetIndexAllDense(T* index, const GHistIndexMatrix& gmat, const size_t nrow)
199-
{
200-
T* local_index = (T*)(&index_[0]);
192+
inline void SetIndexAllDense(T* index, const GHistIndexMatrix& gmat, const size_t nrow) {
193+
T* local_index = reinterpret_cast<T*>(&index_[0]);
201194
for (size_t rid = 0; rid < nrow; ++rid) {
202195
const size_t ibegin = gmat.row_ptr[rid];
203196
const size_t iend = gmat.row_ptr[rid + 1];
@@ -209,43 +202,91 @@ class ColumnMatrix {
209202
}
210203
}
211204
}
205+
206+
inline void SetIndexAllDense(uint32_t* index, const GHistIndexMatrix& gmat, const size_t nrow) {
207+
uint32_t* local_index = reinterpret_cast<uint32_t*>(&index_[0]);
208+
for (size_t rid = 0; rid < nrow; ++rid) {
209+
const size_t ibegin = gmat.row_ptr[rid];
210+
const size_t iend = gmat.row_ptr[rid + 1];
211+
size_t fid = 0;
212+
size_t jp = 0;
213+
for (size_t i = ibegin; i < iend; ++i, ++jp) {
214+
uint32_t* begin = &local_index[boundary_[jp].index_begin];
215+
begin[rid] = index[i] - index_base_[jp];
216+
}
217+
}
218+
}
219+
212220
template<typename T>
213-
inline void SetIndex(T* index, const GHistIndexMatrix& gmat, const size_t nrow, const size_t nfeature)
214-
{
221+
inline void SetIndex(T* index, uint32_t* disp, const GHistIndexMatrix& gmat,
222+
const size_t nrow, const size_t nfeature) {
215223
std::vector<size_t> num_nonzeros;
216224
num_nonzeros.resize(nfeature);
217225
std::fill(num_nonzeros.begin(), num_nonzeros.end(), 0);
218226

219-
T* local_index = (T*)(&index_[0]);
227+
T* local_index = reinterpret_cast<T*>(&index_[0]);
228+
// std::cout << "\n++local_index before set: \n";
229+
// for(size_t i = 0; i < index_.size()/type_size_; ++i)
230+
// std::cout << local_index[i] << " ";
231+
//
232+
// std::cout << "\n++index_base_:\n";
233+
// for (int32_t fid = 0; fid < index_base_.size(); ++fid) {
234+
// std::cout << index_base_[fid] << " ";
235+
// }
236+
237+
// std::cout << "\n++bin_id: \n";
238+
220239
for (size_t rid = 0; rid < nrow; ++rid) {
221240
const size_t ibegin = gmat.row_ptr[rid];
222241
const size_t iend = gmat.row_ptr[rid + 1];
223242
size_t fid = 0;
224243
size_t jp = 0;
225244
for (size_t i = ibegin; i < iend; ++i) {
226-
const uint32_t bin_id = index[i] + index_base_[jp];
245+
const uint32_t bin_id = index[i] + disp[jp];
246+
// std::cout << bin_id << " ";
227247
auto iter = std::upper_bound(gmat.cut.Ptrs().cbegin() + fid,
228248
gmat.cut.Ptrs().cend(), bin_id);
229249
fid = std::distance(gmat.cut.Ptrs().cbegin(), iter) - 1;
230250
if (type_[fid] == kDenseColumn) {
231-
T* begin = &local_index[boundary_[jp].index_begin];
232-
begin[rid] = index[i];
233-
++jp;
251+
T* begin = &local_index[boundary_[fid].index_begin];
252+
begin[rid] = bin_id - index_base_[fid];
234253
} else {
235254
T* begin = &local_index[boundary_[fid].index_begin];
236-
begin[num_nonzeros[fid]] = index[i];//bin_id - index_base_[fid];
255+
begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
237256
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
238257
++num_nonzeros[fid];
239258
}
259+
++jp;
260+
}
261+
}
262+
/*
263+
for (size_t rid = 0; rid < nrow; ++rid) {
264+
const size_t ibegin = gmat.row_ptr[rid];
265+
const size_t iend = gmat.row_ptr[rid + 1];
266+
size_t fid = 0;
267+
for (size_t i = ibegin; i < iend; ++i) {
268+
const uint32_t bin_id = gmat.index[i];
269+
auto iter = std::upper_bound(gmat.cut.Ptrs().cbegin() + fid,
270+
gmat.cut.Ptrs().cend(), bin_id);
271+
fid = std::distance(gmat.cut.Ptrs().cbegin(), iter) - 1;
272+
if (type_[fid] == kDenseColumn) {
273+
uint32_t* begin = &index_[boundary_[fid].index_begin];
274+
begin[rid] = bin_id - index_base_[fid];
275+
} else {
276+
uint32_t* begin = &index_[boundary_[fid].index_begin];
277+
begin[num_nonzeros[fid]] = bin_id - index_base_[fid];
278+
row_ind_[boundary_[fid].row_ind_begin + num_nonzeros[fid]] = rid;
279+
++num_nonzeros[fid];
240280
}
241281
}
282+
}*/
242283
}
243-
const size_t GetTypeSize() const
244-
{
284+
const size_t GetTypeSize() const {
245285
return type_size_;
246286
}
247287

248288
private:
289+
std::vector<uint8_t> index_; // index_: may store smaller integers; needs padding
249290
struct ColumnBoundary {
250291
// indicate where each column's index and row_ind is stored.
251292
// index_begin and index_end are logical offsets, so they should be converted to
@@ -258,7 +299,6 @@ class ColumnMatrix {
258299

259300
std::vector<size_t> feature_counts_;
260301
std::vector<ColumnType> type_;
261-
std::vector<uint8_t> index_; // index_: may store smaller integers; needs padding
262302
std::vector<size_t> row_ind_;
263303
std::vector<ColumnBoundary> boundary_;
264304

0 commit comments

Comments
 (0)