From d6ed3a006025685f6abe726887dadc690bf5d7f3 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Thu, 26 Sep 2024 16:13:53 +0800 Subject: [PATCH] numa. --- src/common/cuda_dr_utils.cc | 34 +++++++---- src/common/cuda_dr_utils.h | 3 + src/common/cuda_rt_utils.cc | 6 +- src/common/device_vector.cuh | 14 ++--- src/common/threading_utils.cc | 12 +++- src/common/threading_utils.h | 8 ++- tests/cpp/common/test_device_vector.cu | 80 ++++++++++++++++++++------ 7 files changed, 114 insertions(+), 43 deletions(-) diff --git a/src/common/cuda_dr_utils.cc b/src/common/cuda_dr_utils.cc index 6abdcc5314c4..00a10d3652c5 100644 --- a/src/common/cuda_dr_utils.cc +++ b/src/common/cuda_dr_utils.cc @@ -4,11 +4,14 @@ #if defined(XGBOOST_USE_CUDA) #include "cuda_dr_utils.h" -#include // for make_unique -#include // for call_once +#include // for memset +#include // for make_unique +#include // for call_once #include "common.h" // for safe_cuda -#include "xgboost/string_view.h" // for StringView +#include "cuda_rt_utils.h" // for CurrentDevice +#include "threading_utils.h" // for GetCpuNuma +#include "xgboost/string_view.h" // for StringVie namespace xgboost::cudr { CuDriverApi::CuDriverApi() { @@ -70,16 +73,23 @@ CuDriverApi &GetGlobalCuDriverApi() { std::call_once(flag, [&] { cu = std::make_unique(); }); return *cu; } -} // namespace xgboost::cudr - -#else - -#include "common.h" -namespace xgboost::cudr { -CuDriverApi &GetGlobalCuDriverApi() { - common::AssertGPUSupport(); - return {}; +CUmemAllocationProp MakeAllocProp(CUmemLocationType type) { + CUmemAllocationProp prop; + std::memset(&prop, '\0', sizeof(prop)); + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = type; + if (type == CU_MEM_LOCATION_TYPE_DEVICE) { + prop.location.id = curt::CurrentDevice(); + } else { + unsigned cpu{0}, numa{0}; + bool status = common::GetCpuNuma(&cpu, &numa); + if (!status) { + numa = 0; // Use 0 as the default. + } + prop.location.id = numa; + } + return prop; } } // namespace xgboost::cudr #endif diff --git a/src/common/cuda_dr_utils.h b/src/common/cuda_dr_utils.h index ec91f8da57cd..ae0effc16285 100644 --- a/src/common/cuda_dr_utils.h +++ b/src/common/cuda_dr_utils.h @@ -80,4 +80,7 @@ inline auto GetAllocGranularity(CUmemAllocationProp const *prop) { &granularity, prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); return granularity; } + +// Describe the allocation property +CUmemAllocationProp MakeAllocProp(CUmemLocationType type); } // namespace xgboost::cudr diff --git a/src/common/cuda_rt_utils.cc b/src/common/cuda_rt_utils.cc index 0dfc4e4d8903..cf16a431a630 100644 --- a/src/common/cuda_rt_utils.cc +++ b/src/common/cuda_rt_utils.cc @@ -20,7 +20,7 @@ std::int32_t AllVisibleGPUs() { // When compiled with CUDA but running on CPU only device, // cudaGetDeviceCount will fail. dh::safe_cuda(cudaGetDeviceCount(&n_visgpus)); - } catch (const dmlc::Error &) { + } catch (const dmlc::Error&) { cudaGetLastError(); // reset error. return 0; } @@ -81,7 +81,7 @@ void RtVersion(std::int32_t* major, std::int32_t* minor) { std::int32_t AllVisibleGPUs() { return 0; } std::int32_t CurrentDevice() { - AssertGPUSupport(); + common::AssertGPUSupport(); return -1; } @@ -93,7 +93,7 @@ void CheckComputeCapability() {} void SetDevice(std::int32_t device) { if (device >= 0) { - AssertGPUSupport(); + common::AssertGPUSupport(); } } #endif // !defined(XGBOOST_USE_CUDA) diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh index b1c0a1ca5383..8ded3e721d6f 100644 --- a/src/common/device_vector.cuh +++ b/src/common/device_vector.cuh @@ -276,16 +276,10 @@ class GrowOnlyVirtualMemVec { public: using value_type = std::int32_t; // NOLINT - explicit GrowOnlyVirtualMemVec(CUmemLocationType type) { - CHECK(type == CU_MEM_LOCATION_TYPE_DEVICE || type == CU_MEM_LOCATION_TYPE_HOST_NUMA); - - // Describe the allocation property - std::memset(&this->prop_, '\0', sizeof(this->prop_)); - this->prop_.type = CU_MEM_ALLOCATION_TYPE_PINNED; - this->prop_.location.type = type; - // FIXME: Assume without numa - this->prop_.location.id = type == CU_MEM_LOCATION_TYPE_DEVICE ? cub::CurrentDevice() : 0; - + explicit GrowOnlyVirtualMemVec(CUmemLocationType type) + : prop_{xgboost::cudr::MakeAllocProp(type)} { + CHECK(type == CU_MEM_LOCATION_TYPE_DEVICE || type == CU_MEM_LOCATION_TYPE_HOST_NUMA || + type == CU_MEM_LOCATION_TYPE_HOST_NUMA_CURRENT); // Get the allocation granularity. this->granularity_ = xgboost::cudr::GetAllocGranularity(&this->prop_); diff --git a/src/common/threading_utils.cc b/src/common/threading_utils.cc index f7296b7f9f3c..0d943f94f9c6 100644 --- a/src/common/threading_utils.cc +++ b/src/common/threading_utils.cc @@ -9,10 +9,12 @@ #include // for ifstream #include // for string -#include "common.h" // for DivRoundUp +#include "common.h" // for DivRoundUp #if defined(__linux__) #include +#include // for SYS_getcpu +#include // for syscall #endif namespace xgboost::common { @@ -118,6 +120,14 @@ std::int32_t OmpGetNumThreads(std::int32_t n_threads) { return n_threads; } +[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa) { +#ifdef SYS_getcpu + return syscall(SYS_getcpu, cpu, numa, NULL) == 0; +#else + return false; +#endif +} + void NameThread(std::thread* t, StringView name) { #if defined(__linux__) auto handle = t->native_handle(); diff --git a/src/common/threading_utils.h b/src/common/threading_utils.h index e21400705f79..a4e2f21e4954 100644 --- a/src/common/threading_utils.h +++ b/src/common/threading_utils.h @@ -306,10 +306,16 @@ class MemStackAllocator { }; /** - * \brief Constant that can be used for initializing static thread local memory. + * @brief Constant that can be used for initializing static thread local memory. */ std::int32_t constexpr DefaultMaxThreads() { return 128; } +/** + * @brief Get numa node on Linux. Other platforms are not supported. Returns false if the + * call fails. + */ +[[nodiscard]] bool GetCpuNuma(unsigned int* cpu, unsigned int* numa); + /** * @brief Give the thread a name. Supports only pthread on linux. */ diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu index f6507cbfc96e..68b4fcbd74a5 100644 --- a/tests/cpp/common/test_device_vector.cu +++ b/tests/cpp/common/test_device_vector.cu @@ -2,10 +2,15 @@ * Copyright 2024, XGBoost Contributors */ #include +#include +#include + + +#include // for iota +#include // for sequence #include "../../../src/common/device_vector.cuh" #include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore -#include "xgboost/string_view.h" namespace dh { TEST(DeviceUVector, Basic) { @@ -20,22 +25,65 @@ TEST(DeviceUVector, Basic) { std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity); } +namespace { +class TestVirtualMem : public ::testing::TestWithParam { + public: + void Run() { + auto type = this->GetParam(); + detail::GrowOnlyVirtualMemVec vec{type}; + auto prop = xgboost::cudr::MakeAllocProp(type); + auto gran = xgboost::cudr::GetAllocGranularity(&prop); + ASSERT_GE(gran, 2); + auto data = vec.GetSpan(32); // should be smaller than granularity + ASSERT_EQ(data.size(), 32); + static_assert(std::is_same_v); + + std::vector h_data(data.size()); + auto check = [&] { + for (std::size_t i = 0; i < h_data.size(); ++i) { + ASSERT_EQ(h_data[i], i); + } + }; + auto fill = [&](std::int32_t n_orig, xgboost::common::Span data) { + if (type == CU_MEM_LOCATION_TYPE_DEVICE) { + thrust::sequence(thrust::cuda::par_nosync, data.data() + n_orig, data.data() + data.size(), + n_orig); + dh::safe_cuda(cudaMemcpy(h_data.data(), data.data(), data.size_bytes(), cudaMemcpyDefault)); + } else { + std::iota(data.data() + n_orig, data.data() + data.size(), n_orig); + std::copy_n(data.data(), data.size(), h_data.data()); + } + }; + + fill(0, data); + check(); -TEST(VirtualMem, AllocDevice) { - for (std::size_t i = 0; i < 10000; ++i) { - detail::GrowOnlyVirtualMemVec vec{CU_MEM_LOCATION_TYPE_DEVICE}; - vec.GrowTo(1024 * sizeof(double)); - vec.GrowTo((4096 + 2097152) * sizeof(double)); - auto data = vec.data(); - dh::safe_cuda(cudaMemset(data, vec.size(), '\0')); + auto n_orig = data.size(); + // Should be greater than granularity since we are using i32. + data = vec.GetSpan(gran); + h_data.resize(data.size()); + fill(n_orig, data); + + check(); } -} +}; +} // anonymous namespace -TEST(VirtualMem, AllocHost) { - detail::GrowOnlyVirtualMemVec vec{CU_MEM_LOCATION_TYPE_HOST_NUMA}; - vec.GrowTo(1024 * sizeof(double)); - vec.GrowTo((1 << 22) * sizeof(double)); - char *data = reinterpret_cast(vec.data()); - std::fill(data, data + vec.size(), 0); -} +TEST_P(TestVirtualMem, Alloc) { this->Run(); } + +INSTANTIATE_TEST_SUITE_P( + Basic, TestVirtualMem, + ::testing::Values(CU_MEM_LOCATION_TYPE_DEVICE, CU_MEM_LOCATION_TYPE_HOST_NUMA), + [](::testing::TestParamInfo const& info) -> char const* { + auto type = info.param; + switch (type) { + case CU_MEM_LOCATION_TYPE_DEVICE: + return "device"; + case CU_MEM_LOCATION_TYPE_HOST_NUMA: + return "host_numa"; + default: + LOG(FATAL) << "unreachable"; + } + return nullptr; + }); } // namespace dh