Skip to content

Commit

Permalink
[EM] Prevent init with CUDA malloc resource. (#10606)
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis authored Jul 20, 2024
1 parent 0846ad8 commit cb62f9e
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 41 deletions.
37 changes: 37 additions & 0 deletions src/common/cuda_rt_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@
*/
#pragma once
#include <cstdint> // for int32_t

#if defined(XGBOOST_USE_NVTX)
#include <nvtx3/nvtx3.hpp>
#endif // defined(XGBOOST_USE_NVTX)

namespace xgboost::common {
std::int32_t AllVisibleGPUs();

Expand All @@ -18,4 +23,36 @@ bool SupportsAts();
void CheckComputeCapability();

void SetDevice(std::int32_t device);

/**
 * @brief Tag type that names the NVTX domain used by xgboost ranges.
 *
 * It is passed as the domain template argument to the nvtx3 helpers; `name` is the
 * string that identifies the domain.
 */
struct NvtxDomain {
  static constexpr char const *name = "libxgboost";  // NOLINT
};

#if defined(XGBOOST_USE_NVTX)
// NVTX is enabled: the helper names are plain aliases of the nvtx3 C++ types. The scoped
// range alias is bound to NvtxDomain.
using NvtxScopedRange = ::nvtx3::scoped_range_in<NvtxDomain>;
using NvtxEventAttr = ::nvtx3::event_attributes;
using NvtxRgb = ::nvtx3::rgb;
#else
/**
 * @brief No-op stand-in for the scoped range type, used when NVTX is not enabled.
 *
 * The constructor accepts any number of arguments of any type and ignores them, so call
 * sites compile unchanged without NVTX.
 */
class NvtxScopedRange {
 public:
  template <typename... Ts>
  explicit NvtxScopedRange(Ts &&... /*ignored*/) {}
};
/**
 * @brief No-op stand-in for the event attributes type, used when NVTX is not enabled.
 *
 * Any constructor arguments are accepted and discarded.
 */
class NvtxEventAttr {
 public:
  template <typename... Ts>
  explicit NvtxEventAttr(Ts &&... /*ignored*/) {}
};
/**
 * @brief No-op stand-in for the RGB colour type, used when NVTX is not enabled.
 *
 * Any constructor arguments are accepted and discarded.
 */
class NvtxRgb {
 public:
  template <typename... Ts>
  explicit NvtxRgb(Ts &&... /*ignored*/) {}
};
#endif // defined(XGBOOST_USE_NVTX)
} // namespace xgboost::common

// Function-level NVTX annotation. When XGBOOST_USE_NVTX is defined, this forwards to nvtx3's
// NVTX3_FUNC_RANGE_IN with the xgboost domain; otherwise it expands to nothing, so call
// sites need no conditional compilation of their own.
#if defined(XGBOOST_USE_NVTX)
#define xgboost_NVTX_FN_RANGE() NVTX3_FUNC_RANGE_IN(::xgboost::common::NvtxDomain)
#else
#define xgboost_NVTX_FN_RANGE()
#endif // defined(XGBOOST_USE_NVTX)
11 changes: 9 additions & 2 deletions src/common/ref_resource_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,17 @@ namespace xgboost::common {
* @brief Make a fixed size `RefResourceView` with cudaMalloc resource.
*/
template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
std::size_t n_elements, T const& init) {
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const*,
std::size_t n_elements) {
// The context parameter is unnamed and unused in this overload.
// Construct a shared CudaMallocResource sized for n_elements values of T.
auto resource = std::make_shared<common::CudaMallocResource>(n_elements * sizeof(T));
// Build the view over the resource's storage and hand the resource to the view. This overload
// performs no fill of its own.
auto ref = RefResourceView{resource->DataAs<T>(), n_elements, resource};
return ref;
}

/**
 * @brief Make a fixed size `RefResourceView` with cudaMalloc resource and set every
 *        element to `init`.
 *
 * @param ctx        Context whose CUDA context supplies the policy passed to thrust for the fill.
 * @param n_elements Number of elements of type T in the returned view.
 * @param init       Value written to each element.
 *
 * @return The filled view.
 */
template <typename T>
[[nodiscard]] RefResourceView<T> MakeFixedVecWithCudaMalloc(Context const* ctx,
                                                            std::size_t n_elements, T const& init) {
  // Delegate the allocation to the overload without an initial value, then fill.
  auto view = MakeFixedVecWithCudaMalloc<T>(ctx, n_elements);
  thrust::fill_n(ctx->CUDACtx()->CTP(), view.data(), view.size(), init);
  return view;
}
Expand Down
6 changes: 2 additions & 4 deletions src/common/resource.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,9 @@ class CudaMallocResource : public ResourceHandler {
}
~CudaMallocResource() noexcept(true) override { this->Clear(); }

void* Data() override { return storage_.data(); }
[[nodiscard]] void* Data() override { return storage_.data(); }
[[nodiscard]] std::size_t Size() const override { return storage_.size(); }
void Resize(std::size_t n_bytes, std::byte init = std::byte{0}) {
this->storage_.resize(n_bytes, init);
}
void Resize(std::size_t n_bytes) { this->storage_.resize(n_bytes); }
};

class CudaMmapResource : public ResourceHandler {
Expand Down
29 changes: 14 additions & 15 deletions src/common/timer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@
#include <utility>

#include "../collective/communicator-inl.h"
#include "cuda_rt_utils.h"

#if defined(XGBOOST_USE_NVTX)
#include <nvtx3/nvToolsExt.h>
#include <nvtx3/nvtx3.hpp>
#endif // defined(XGBOOST_USE_NVTX)

namespace xgboost::common {
Expand All @@ -17,8 +18,8 @@ void Monitor::Start(std::string const &name) {
auto &stats = statistics_map_[name];
stats.timer.Start();
#if defined(XGBOOST_USE_NVTX)
std::string nvtx_name = "xgboost::" + label_ + "::" + name;
stats.nvtx_id = nvtxRangeStartA(nvtx_name.c_str());
auto range_handle = nvtx3::start_range_in<common::NvtxDomain>(label_ + "::" + name);
stats.nvtx_id = range_handle.get_value();
#endif // defined(XGBOOST_USE_NVTX)
}
}
Expand All @@ -29,34 +30,32 @@ void Monitor::Stop(const std::string &name) {
stats.timer.Stop();
stats.count++;
#if defined(XGBOOST_USE_NVTX)
nvtxRangeEnd(stats.nvtx_id);
nvtx3::end_range_in<common::NvtxDomain>(nvtx3::range_handle{stats.nvtx_id});
#endif // defined(XGBOOST_USE_NVTX)
}
}

void Monitor::PrintStatistics(StatMap const& statistics) const {
void Monitor::PrintStatistics(StatMap const &statistics) const {
for (auto &kv : statistics) {
if (kv.second.first == 0) {
LOG(WARNING) <<
"Timer for " << kv.first << " did not get stopped properly.";
LOG(WARNING) << "Timer for " << kv.first << " did not get stopped properly.";
continue;
}
LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6
<< "s, " << kv.second.first << " calls @ "
<< kv.second.second
<< "us" << std::endl;
LOG(CONSOLE) << kv.first << ": " << static_cast<double>(kv.second.second) / 1e+6 << "s, "
<< kv.second.first << " calls @ " << kv.second.second << "us" << std::endl;
}
}

void Monitor::Print() const {
if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) { return; }
if (!ConsoleLogger::ShouldLog(ConsoleLogger::LV::kDebug)) {
return;
}
auto rank = collective::GetRank();
StatMap stat_map;
for (auto const &kv : statistics_map_) {
stat_map[kv.first] = std::make_pair(
kv.second.count, std::chrono::duration_cast<std::chrono::microseconds>(
kv.second.timer.elapsed)
.count());
kv.second.count,
std::chrono::duration_cast<std::chrono::microseconds>(kv.second.timer.elapsed).count());
}
if (stat_map.empty()) {
return;
Expand Down
2 changes: 1 addition & 1 deletion src/data/ellpack_page.cu
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ size_t EllpackPageImpl::Copy(Context const* ctx, EllpackPageImpl const* page, bs
LOG(FATAL) << "Concatenating the same Ellpack.";
return this->n_rows * this->row_stride;
}
dh::LaunchN(num_elements, CopyPage{this, page, offset});
dh::LaunchN(num_elements, ctx->CUDACtx()->Stream(), CopyPage{this, page, offset});
monitor_.Stop(__func__);
return num_elements;
}
Expand Down
60 changes: 41 additions & 19 deletions src/data/ellpack_page_raw_format.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include <cstddef> // for size_t
#include <vector> // for vector

#include "../common/cuda_rt_utils.h"
#include "../common/io.h" // for AlignedResourceReadStream, AlignedFileWriteStream
#include "../common/ref_resource_view.cuh" // for MakeFixedVecWithCudaMalloc
#include "../common/ref_resource_view.h" // for ReadVec, WriteVec
Expand All @@ -21,6 +22,8 @@ namespace {
template <typename T>
[[nodiscard]] bool ReadDeviceVec(common::AlignedResourceReadStream* fi,
common::RefResourceView<T>* vec) {
xgboost_NVTX_FN_RANGE();

std::uint64_t n{0};
if (!fi->Read(&n)) {
return false;
Expand All @@ -37,7 +40,7 @@ template <typename T>
}

auto ctx = Context{}.MakeCUDA(common::CurrentDevice());
*vec = common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<T>(0));
*vec = common::MakeFixedVecWithCudaMalloc<T>(&ctx, n);
dh::safe_cuda(cudaMemcpyAsync(vec->data(), ptr, n_bytes, cudaMemcpyDefault, dh::DefaultStream()));
return true;
}
Expand All @@ -50,6 +53,7 @@ template <typename T>

[[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page,
common::AlignedResourceReadStream* fi) {
xgboost_NVTX_FN_RANGE();
auto* impl = page->Impl();

impl->SetCuts(this->cuts_);
Expand All @@ -69,6 +73,8 @@ template <typename T>

[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
common::AlignedFileWriteStream* fo) {
xgboost_NVTX_FN_RANGE();

std::size_t bytes{0};
auto* impl = page.Impl();
bytes += fo->Write(impl->n_rows);
Expand All @@ -84,22 +90,30 @@ template <typename T>
}

[[nodiscard]] bool EllpackPageRawFormat::Read(EllpackPage* page, EllpackHostCacheStream* fi) const {
xgboost_NVTX_FN_RANGE();

auto* impl = page->Impl();
CHECK(this->cuts_->cut_values_.DeviceCanRead());
impl->SetCuts(this->cuts_);
RET_IF_NOT(fi->Read(&impl->n_rows));
RET_IF_NOT(fi->Read(&impl->is_dense));
RET_IF_NOT(fi->Read(&impl->row_stride));

// Read vec
// Read vector
Context ctx = Context{}.MakeCUDA(common::CurrentDevice());
bst_idx_t n{0};
RET_IF_NOT(fi->Read(&n));
if (n != 0) {
impl->gidx_buffer =
common::MakeFixedVecWithCudaMalloc(&ctx, n, static_cast<common::CompressedByteT>(0));
auto read_vec = [&] {
common::NvtxScopedRange range{common::NvtxEventAttr{"read-vec", common::NvtxRgb{127, 255, 0}}};
bst_idx_t n{0};
RET_IF_NOT(fi->Read(&n));
if (n == 0) {
return true;
}
impl->gidx_buffer = common::MakeFixedVecWithCudaMalloc<common::CompressedByteT>(&ctx, n);
RET_IF_NOT(fi->Read(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes()));
}
return true;
};
RET_IF_NOT(read_vec());

RET_IF_NOT(fi->Read(&impl->n_rows));
RET_IF_NOT(fi->Read(&impl->is_dense));
RET_IF_NOT(fi->Read(&impl->row_stride));
RET_IF_NOT(fi->Read(&impl->base_rowid));

dh::DefaultStream().Sync();
Expand All @@ -108,19 +122,27 @@ template <typename T>

[[nodiscard]] std::size_t EllpackPageRawFormat::Write(const EllpackPage& page,
EllpackHostCacheStream* fo) const {
xgboost_NVTX_FN_RANGE();

bst_idx_t bytes{0};
auto* impl = page.Impl();
bytes += fo->Write(impl->n_rows);
bytes += fo->Write(impl->is_dense);
bytes += fo->Write(impl->row_stride);

// Write vector
bst_idx_t n = impl->gidx_buffer.size();
bytes += fo->Write(n);
auto write_vec = [&] {
common::NvtxScopedRange range{common::NvtxEventAttr{"write-vec", common::NvtxRgb{127, 255, 0}}};
bst_idx_t n = impl->gidx_buffer.size();
bytes += fo->Write(n);

if (!impl->gidx_buffer.empty()) {
bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
}
if (!impl->gidx_buffer.empty()) {
bytes += fo->Write(impl->gidx_buffer.data(), impl->gidx_buffer.size_bytes());
}
};

write_vec();

bytes += fo->Write(impl->n_rows);
bytes += fo->Write(impl->is_dense);
bytes += fo->Write(impl->row_stride);
bytes += fo->Write(impl->base_rowid);

dh::DefaultStream().Sync();
Expand Down
1 change: 1 addition & 0 deletions tests/ci_build/conda_env/macos_cpu_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@ dependencies:
- pyspark>=3.4.0
- cloudpickle
- pip:
- setuptools
- sphinx_rtd_theme

0 comments on commit cb62f9e

Please sign in to comment.