Skip to content

Commit

Permalink
[EM] Multi-level quantile sketching for GPU. (#10813)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Sep 10, 2024
1 parent 3ef8383 commit ed5f33d
Show file tree
Hide file tree
Showing 6 changed files with 111 additions and 34 deletions.
12 changes: 8 additions & 4 deletions demo/guide-python/quantile_data_iterator.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,17 +105,21 @@ def main():
assert m_with_it.num_row() == m.num_row()
# Tree method must be `hist`.
reg_with_it = xgboost.train(
{"tree_method": "hist", "device": "cuda"}, m_with_it, num_boost_round=rounds
{"tree_method": "hist", "device": "cuda"},
m_with_it,
num_boost_round=rounds,
evals=[(m_with_it, "Train")],
)
predict_with_it = reg_with_it.predict(m_with_it)

reg = xgboost.train(
{"tree_method": "hist", "device": "cuda"}, m, num_boost_round=rounds
{"tree_method": "hist", "device": "cuda"},
m,
num_boost_round=rounds,
evals=[(m, "Train")],
)
predict = reg.predict(m)

numpy.testing.assert_allclose(predict_with_it, predict, rtol=1e6)


if __name__ == "__main__":
main()
23 changes: 18 additions & 5 deletions src/common/quantile.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -143,17 +143,30 @@ class SketchContainer {
*/
void Push(Context const* ctx, Span<Entry const> entries, Span<size_t> columns_ptr,
common::Span<OffsetT> cuts_ptr, size_t total_cuts, Span<float> weights = {});
/* \brief Prune the quantile structure.
/**
* @brief Prune the quantile structure.
*
* \param to The maximum size of pruned quantile. If the size of quantile
* structure is already less than `to`, then no operation is performed.
* @param to The maximum size of pruned quantile. If the size of quantile structure is
* already less than `to`, then no operation is performed.
*/
void Prune(Context const* ctx, size_t to);
/* \brief Merge another set of sketch.
* \param that columns of other.
/**
* @brief Merge another set of sketch.
*
* @param that_columns_ptr Column pointer of the quantile summary being merged.
* @param that Columns of the other quantile summary.
*/
void Merge(Context const* ctx, Span<OffsetT const> that_columns_ptr,
Span<SketchEntry const> that);
/**
 * @brief Shrink the internal buffers to reduce memory usage. Can be used after
 *        pruning.
 *
 * The current buffer is shrunk to fit its contents. The other buffer is cleared
 * first and then shrunk, so whatever it held is discarded.
 */
void ShrinkToFit() {
  auto&& current = this->Current();
  current.shrink_to_fit();

  auto&& other = this->Other();
  other.clear();
  other.shrink_to_fit();
}

/* \brief Merge quantiles from other GPU workers. */
void AllReduce(Context const* ctx, bool is_column_split);
Expand Down
98 changes: 79 additions & 19 deletions src/data/quantile_dmatrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/
#include <algorithm> // for max
#include <numeric> // for partial_sum
#include <utility> // for pair
#include <vector> // for vector

#include "../collective/allreduce.h" // for Allreduce
Expand All @@ -29,11 +30,39 @@ void MakeSketches(Context const* ctx,
float missing, std::shared_ptr<common::HistogramCuts> cuts, MetaInfo const& info,
ExternalDataInfo* p_ext_info) {
xgboost_NVTX_FN_RANGE();

std::unique_ptr<common::SketchContainer> sketch;
/**
* A variant of: A Fast Algorithm for Approximate Quantiles in High Speed Data Streams
*
* The original algorithm was designed for CPU where input is a stream with individual
* elements. For GPU, we process the data in batches. As a result, the implementation
* here simply uses the user input batch as the basic unit of sketching blocks. The
* number of blocks per-level grows exponentially.
*/
std::vector<std::pair<std::unique_ptr<common::SketchContainer>, bst_idx_t>> sketches;
auto& ext_info = *p_ext_info;

auto lazy_init_sketch = [&] {
// Lazy because we need the `n_features`.
sketches.emplace_back(std::make_unique<common::SketchContainer>(
proxy->Info().feature_types, p.max_bin, ext_info.n_features,
data::BatchSamples(proxy), dh::GetDevice(ctx)),
0);
};

// Workaround empty input with CPU ctx.
Context new_ctx;
Context const* p_ctx;
if (ctx->IsCUDA()) {
p_ctx = ctx;
} else {
new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}});
p_ctx = &new_ctx;
}

do {
/**
* Get the data shape.
*/
// We use do while here as the first batch is fetched in ctor
CHECK_LT(ctx->Ordinal(), common::AllVisibleGPUs());
common::SetDevice(dh::GetDevice(ctx).ordinal);
Expand All @@ -46,28 +75,38 @@ void MakeSketches(Context const* ctx,
CHECK_EQ(ext_info.n_features, ::xgboost::data::BatchColumns(proxy))
<< "Inconsistent number of columns.";
}

auto batch_rows = data::BatchSamples(proxy);
ext_info.accumulated_rows += batch_rows;

/**
* Handle sketching.
*/
if (!ref) {
if (!sketch) {
sketch = std::make_unique<common::SketchContainer>(
proxy->Info().feature_types, p.max_bin, ext_info.n_features, data::BatchSamples(proxy),
dh::GetDevice(ctx));
if (sketches.empty()) {
lazy_init_sketch();
}
if (sketches.back().second > (1ul << (sketches.size() - 1))) {
auto n_cuts_per_feat =
common::detail::RequiredSampleCutsPerColumn(p.max_bin, ext_info.accumulated_rows);
// Prune to a single block
sketches.back().first->Prune(p_ctx, n_cuts_per_feat);
sketches.back().first->ShrinkToFit();

sketches.back().second = 1;
lazy_init_sketch(); // Add a new level.
}
proxy->Info().weights_.SetDevice(dh::GetDevice(ctx));
cuda_impl::Dispatch(proxy, [&](auto const& value) {
// Workaround empty input with CPU ctx.
Context new_ctx;
Context const* p_ctx;
if (ctx->IsCUDA()) {
p_ctx = ctx;
} else {
new_ctx.UpdateAllowUnknown(Args{{"device", dh::GetDevice(ctx).Name()}});
p_ctx = &new_ctx;
}
common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing, sketch.get());
common::AdapterDeviceSketch(p_ctx, value, p.max_bin, proxy->Info(), missing,
sketches.back().first.get());
sketches.back().second++;
});
}
auto batch_rows = data::BatchSamples(proxy);
ext_info.accumulated_rows += batch_rows;

/**
* Rest of the data shape.
*/
dh::device_vector<size_t> row_counts(batch_rows + 1, 0);
common::Span<size_t> row_counts_span(row_counts.data().get(), row_counts.size());
ext_info.row_stride =
Expand All @@ -87,7 +126,28 @@ void MakeSketches(Context const* ctx,
// Get reference
common::SetDevice(dh::GetDevice(ctx).ordinal);
if (!ref) {
sketch->MakeCuts(ctx, cuts.get(), info.IsColumnSplit());
HostDeviceVector<FeatureType> ft;
common::SketchContainer final_sketch(
sketches.empty() ? ft : sketches.front().first->FeatureTypes(), p.max_bin,
ext_info.n_features, ext_info.accumulated_rows, dh::GetDevice(ctx));
// Merge in reverse order since the last container might contain a summary that has not been pruned yet.
for (auto it = sketches.crbegin(); it != sketches.crend(); ++it) {
auto& sketch = *it;

CHECK_GE(sketch.second, 1);
if (sketch.second > 1) {
sketch.first->Prune(p_ctx, common::detail::RequiredSampleCutsPerColumn(
p.max_bin, ext_info.accumulated_rows));
sketch.first->ShrinkToFit();
}
final_sketch.Merge(p_ctx, sketch.first->ColumnsPtr(), sketch.first->Data());
final_sketch.FixError();
}

sketches.clear();
sketches.shrink_to_fit();

final_sketch.MakeCuts(ctx, cuts.get(), info.IsColumnSplit());
} else {
GetCutsFromRef(ctx, ref, ext_info.n_features, p, cuts.get());
}
Expand Down
4 changes: 2 additions & 2 deletions src/data/sparse_page_source.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,11 +289,11 @@ class SparsePageSourceImpl : public BatchIteratorImpl<S>, public FormatStreamPol
auto page = std::make_shared<S>();
this->exce_.Run([&] {
std::unique_ptr<typename FormatStreamPolicy::FormatT> fmt{
this->CreatePageFormat(this->param_)};
self->CreatePageFormat(self->param_)};
auto name = self->cache_info_->ShardName();
auto [offset, length] = self->cache_info_->View(fetch_it);
std::unique_ptr<typename FormatStreamPolicy::ReaderT> fi{
this->CreateReader(name, offset, length)};
self->CreateReader(name, offset, length)};
CHECK(fmt->Read(page.get(), fi.get()));
});
return page;
Expand Down
6 changes: 3 additions & 3 deletions tests/cpp/common/test_hist_util.cu
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

#include "../../../include/xgboost/logging.h"
#include "../../../src/common/cuda_context.cuh"
#include "../../../src/common/cuda_rt_utils.h" // for SetDevice
#include "../../../src/common/device_helpers.cuh"
#include "../../../src/common/hist_util.cuh"
#include "../../../src/common/hist_util.h"
Expand Down Expand Up @@ -59,8 +60,7 @@ TEST(HistUtil, SketchBatchNumElements) {
GTEST_SKIP_("Test not runnable with RMM enabled.");
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
size_t constexpr kCols = 10000;
int device;
dh::safe_cuda(cudaGetDevice(&device));
std::int32_t device = dh::CurrentDevice();
auto avail = static_cast<size_t>(dh::AvailableMemory(device) * 0.8);
auto per_elem = detail::BytesPerElement(false);
auto avail_elem = avail / per_elem;
Expand Down Expand Up @@ -576,7 +576,7 @@ TEST(HistUtil, AdapterDeviceSketchBatches) {

namespace {
auto MakeData(Context const* ctx, std::size_t n_samples, bst_feature_t n_features) {
dh::safe_cuda(cudaSetDevice(ctx->Ordinal()));
common::SetDevice(ctx->Ordinal());
auto n = n_samples * n_features;
std::vector<float> x;
x.resize(n);
Expand Down
2 changes: 1 addition & 1 deletion tests/cpp/data/test_iterative_dmatrix.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

namespace xgboost::data {
void TestEquivalent(float sparsity) {
Context ctx{MakeCUDACtx(0)};
auto ctx = MakeCUDACtx(0);

CudaArrayIterForTest iter{sparsity};
IterativeDMatrix m(&iter, iter.Proxy(), nullptr, Reset, Next,
Expand Down

0 comments on commit ed5f33d

Please sign in to comment.