Skip to content

Commit

Permalink
numa.
Browse files Browse the repository at this point in the history
  • Loading branch information
trivialfis committed Sep 26, 2024
1 parent 5884680 commit d6ed3a0
Show file tree
Hide file tree
Showing 7 changed files with 114 additions and 43 deletions.
34 changes: 22 additions & 12 deletions src/common/cuda_dr_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
#if defined(XGBOOST_USE_CUDA)
#include "cuda_dr_utils.h"

#include <memory> // for make_unique
#include <mutex> // for call_once
#include <cstring> // for memset
#include <memory> // for make_unique
#include <mutex> // for call_once

#include "common.h" // for safe_cuda
#include "xgboost/string_view.h" // for StringView
#include "cuda_rt_utils.h" // for CurrentDevice
#include "threading_utils.h" // for GetCpuNuma
#include "xgboost/string_view.h" // for StringVie

namespace xgboost::cudr {
CuDriverApi::CuDriverApi() {
Expand Down Expand Up @@ -70,16 +73,23 @@ CuDriverApi &GetGlobalCuDriverApi() {
std::call_once(flag, [&] { cu = std::make_unique<CuDriverApi>(); });
return *cu;
}
} // namespace xgboost::cudr

#else

#include "common.h"

namespace xgboost::cudr {
CuDriverApi &GetGlobalCuDriverApi() {
common::AssertGPUSupport();
return {};
CUmemAllocationProp MakeAllocProp(CUmemLocationType type) {
CUmemAllocationProp prop;
std::memset(&prop, '\0', sizeof(prop));
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = type;
if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
prop.location.id = curt::CurrentDevice();
} else {
unsigned cpu{0}, numa{0};
bool status = common::GetCpuNuma(&cpu, &numa);
if (!status) {
numa = 0; // Use 0 as the default.
}
prop.location.id = numa;
}
return prop;
}
} // namespace xgboost::cudr
#endif
3 changes: 3 additions & 0 deletions src/common/cuda_dr_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,7 @@ inline auto GetAllocGranularity(CUmemAllocationProp const *prop) {
&granularity, prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
return granularity;
}

// Describe the allocation property
CUmemAllocationProp MakeAllocProp(CUmemLocationType type);
} // namespace xgboost::cudr
6 changes: 3 additions & 3 deletions src/common/cuda_rt_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ std::int32_t AllVisibleGPUs() {
// When compiled with CUDA but running on CPU only device,
// cudaGetDeviceCount will fail.
dh::safe_cuda(cudaGetDeviceCount(&n_visgpus));
} catch (const dmlc::Error &) {
} catch (const dmlc::Error&) {
cudaGetLastError(); // reset error.
return 0;
}
Expand Down Expand Up @@ -81,7 +81,7 @@ void RtVersion(std::int32_t* major, std::int32_t* minor) {
std::int32_t AllVisibleGPUs() { return 0; }

std::int32_t CurrentDevice() {
AssertGPUSupport();
common::AssertGPUSupport();
return -1;
}

Expand All @@ -93,7 +93,7 @@ void CheckComputeCapability() {}

void SetDevice(std::int32_t device) {
if (device >= 0) {
AssertGPUSupport();
common::AssertGPUSupport();
}
}
#endif // !defined(XGBOOST_USE_CUDA)
Expand Down
14 changes: 4 additions & 10 deletions src/common/device_vector.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -276,16 +276,10 @@ class GrowOnlyVirtualMemVec {
public:
using value_type = std::int32_t; // NOLINT

explicit GrowOnlyVirtualMemVec(CUmemLocationType type) {
CHECK(type == CU_MEM_LOCATION_TYPE_DEVICE || type == CU_MEM_LOCATION_TYPE_HOST_NUMA);

// Describe the allocation property
std::memset(&this->prop_, '\0', sizeof(this->prop_));
this->prop_.type = CU_MEM_ALLOCATION_TYPE_PINNED;
this->prop_.location.type = type;
// FIXME: Assume without numa
this->prop_.location.id = type == CU_MEM_LOCATION_TYPE_DEVICE ? cub::CurrentDevice() : 0;

explicit GrowOnlyVirtualMemVec(CUmemLocationType type)
: prop_{xgboost::cudr::MakeAllocProp(type)} {
CHECK(type == CU_MEM_LOCATION_TYPE_DEVICE || type == CU_MEM_LOCATION_TYPE_HOST_NUMA ||
type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT);
// Get the allocation granularity.
this->granularity_ = xgboost::cudr::GetAllocGranularity(&this->prop_);

Expand Down
12 changes: 11 additions & 1 deletion src/common/threading_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,12 @@
#include <fstream> // for ifstream
#include <string> // for string

#include "common.h" // for DivRoundUp
#include "common.h" // for DivRoundUp

#if defined(__linux__)
#include <pthread.h>
#include <sys/syscall.h> // for SYS_getcpu
#include <unistd.h> // for syscall
#endif

namespace xgboost::common {
Expand Down Expand Up @@ -118,6 +120,14 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) {
return n_threads;
}

[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa) {
#ifdef SYS_getcpu
return syscall(SYS_getcpu, cpu, numa, NULL) == 0;
#else
return false;
#endif
}

void NameThread(std::thread* t, StringView name) {
#if defined(__linux__)
auto handle = t->native_handle();
Expand Down
8 changes: 7 additions & 1 deletion src/common/threading_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -306,10 +306,16 @@ class MemStackAllocator {
};

/**
* \brief Constant that can be used for initializing static thread local memory.
* @brief Constant that can be used for initializing static thread local memory.
*/
std::int32_t constexpr DefaultMaxThreads() { return 128; }

/**
* @brief Get numa node on Linux. Other platforms are not supported. Returns false if the
* call fails.
*/
[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa);

/**
* @brief Give the thread a name. Supports only pthread on linux.
*/
Expand Down
80 changes: 64 additions & 16 deletions tests/cpp/common/test_device_vector.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,15 @@
* Copyright 2024, XGBoost Contributors
*/
#include <gtest/gtest.h>
#include <linux/sched.h>
#include <sched.h>


#include <numeric> // for iota
#include <thrust/detail/sequence.inl> // for sequence

#include "../../../src/common/device_vector.cuh"
#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore
#include "xgboost/string_view.h"

namespace dh {
TEST(DeviceUVector, Basic) {
Expand All @@ -20,22 +25,65 @@ TEST(DeviceUVector, Basic) {
std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
}

namespace {
class TestVirtualMem : public ::testing::TestWithParam<CUmemLocationType> {
public:
void Run() {
auto type = this->GetParam();
detail::GrowOnlyVirtualMemVec vec{type};
auto prop = xgboost::cudr::MakeAllocProp(type);
auto gran = xgboost::cudr::GetAllocGranularity(&prop);
ASSERT_GE(gran, 2);
auto data = vec.GetSpan<std::int32_t>(32); // should be smaller than granularity
ASSERT_EQ(data.size(), 32);
static_assert(std::is_same_v<typename decltype(data)::value_type, std::int32_t>);

std::vector<std::int32_t> h_data(data.size());
auto check = [&] {
for (std::size_t i = 0; i < h_data.size(); ++i) {
ASSERT_EQ(h_data[i], i);
}
};
auto fill = [&](std::int32_t n_orig, xgboost::common::Span<std::int32_t> data) {
if (type == CU_MEM_LOCATION_TYPE_DEVICE) {
thrust::sequence(thrust::cuda::par_nosync, data.data() + n_orig, data.data() + data.size(),
n_orig);
dh::safe_cuda(cudaMemcpy(h_data.data(), data.data(), data.size_bytes(), cudaMemcpyDefault));
} else {
std::iota(data.data() + n_orig, data.data() + data.size(), n_orig);
std::copy_n(data.data(), data.size(), h_data.data());
}
};

fill(0, data);
check();

TEST(VirtualMem, AllocDevice) {
for (std::size_t i = 0; i < 10000; ++i) {
detail::GrowOnlyVirtualMemVec vec{CU_MEM_LOCATION_TYPE_DEVICE};
vec.GrowTo(1024 * sizeof(double));
vec.GrowTo((4096 + 2097152) * sizeof(double));
auto data = vec.data();
dh::safe_cuda(cudaMemset(data, vec.size(), '\0'));
auto n_orig = data.size();
// Should be greater than granularity since we are using i32.
data = vec.GetSpan<std::int32_t>(gran);
h_data.resize(data.size());
fill(n_orig, data);

check();
}
}
};
} // anonymous namespace

TEST(VirtualMem, AllocHost) {
detail::GrowOnlyVirtualMemVec vec{CU_MEM_LOCATION_TYPE_HOST_NUMA};
vec.GrowTo(1024 * sizeof(double));
vec.GrowTo((1 << 22) * sizeof(double));
char *data = reinterpret_cast<char *>(vec.data());
std::fill(data, data + vec.size(), 0);
}
TEST_P(TestVirtualMem, Alloc) { this->Run(); }

INSTANTIATE_TEST_SUITE_P(
Basic, TestVirtualMem,
::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA),
[](::testing::TestParamInfo<TestVirtualMem::ParamType> const& info) -> char const* {
auto type = info.param;
switch (type) {
case CU_MEM_LOCATION_TYPE_DEVICE:
return "device";
case CU_MEM_LOCATION_TYPE_HOST_NUMA:
return "host_numa";
default:
LOG(FATAL) << "unreachable";
}
return nullptr;
});
} // namespace dh

0 comments on commit d6ed3a0

Please sign in to comment.