diff --git a/CMakeLists.txt b/CMakeLists.txt
index 22fe4a3eb977..f5bb8855d859 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -38,6 +38,12 @@ elseif(CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
   endif()
 endif()
 
+include(${xgboost_SOURCE_DIR}/cmake/CheckSVEsupport.cmake)
+check_xgboost_sve_support()
+if(XGBOOST_COMPILER_HAS_ARM_SVE)
+  add_compile_definitions(XGBOOST_SVE_COMPILER_SUPPORT)
+endif()
+
 include(${xgboost_SOURCE_DIR}/cmake/PrefetchIntrinsics.cmake)
 find_prefetch_intrinsics()
 include(${xgboost_SOURCE_DIR}/cmake/Version.cmake)
diff --git a/cmake/CheckSVEsupport.cmake b/cmake/CheckSVEsupport.cmake
new file mode 100644
index 000000000000..3abc19e6b1b2
--- /dev/null
+++ b/cmake/CheckSVEsupport.cmake
@@ -0,0 +1,32 @@
+function(check_xgboost_sve_support)
+  if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64")
+    include(CheckCSourceCompiles)
+
+    # Save the original C_FLAGS to restore later
+    set(ORIGINAL_C_FLAGS "${CMAKE_C_FLAGS}")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+sve")
+
+    # Check if the compiler supports ARM SVE
+    check_c_source_compiles("
+      #if defined(__aarch64__) && defined(__ARM_FEATURE_SVE)
+      #include <arm_sve.h>
+      int main() {
+        svfloat64_t a;
+        a = svdup_n_f64(0);
+        return 0;
+      }
+      #endif
+    " XGBOOST_COMPILER_HAS_ARM_SVE)
+
+    if(XGBOOST_COMPILER_HAS_ARM_SVE)
+      message(STATUS "ARM SVE compiler support detected")
+    else()
+      message(STATUS "ARM SVE compiler support not detected")
+    endif()
+
+    # Restore the original C_FLAGS
+    set(CMAKE_C_FLAGS "${ORIGINAL_C_FLAGS}")
+  else()
+    message(STATUS "Not an aarch64 architecture")
+  endif()
+endfunction()
\ No newline at end of file
diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc
index dfd80cb68c13..149c46f25651 100644
--- a/src/common/hist_util.cc
+++ b/src/common/hist_util.cc
@@ -1,5 +1,6 @@
 /**
  * Copyright 2017-2023 by XGBoost Contributors
+ * Copyright 2024 FUJITSU LIMITED
  * \file hist_util.cc
  */
 #include "hist_util.h"
@@ -15,19 +16,28 @@
 #include "xgboost/context.h"  // for Context
 #include "xgboost/data.h"     // for SparsePage, SortedCSCPage
 
+#ifdef __linux__
+#include <sys/prctl.h>
+#define PR_SVE_GET_VL 51
+#endif
+
+#ifdef XGBOOST_SVE_COMPILER_SUPPORT
+#include <arm_sve.h>  // to leverage sve intrinsics
+#endif
+
 #if defined(XGBOOST_MM_PREFETCH_PRESENT)
-  #include <xmmintrin.h>
-  #define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)
+#include <xmmintrin.h>
+#define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char *>(addr), _MM_HINT_T0)
 #elif defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
-  #define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char*>(addr), 0, 3)
+#define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char *>(addr), 0, 3)
 #else  // no SW pre-fetching available; PREFETCH_READ_T0 is no-op
-  #define PREFETCH_READ_T0(addr) do {} while (0)
+#define PREFETCH_READ_T0(addr) \
+  do {                         \
+  } while (0)
 #endif  // defined(XGBOOST_MM_PREFETCH_PRESENT)
 
 namespace xgboost::common {
-HistogramCuts::HistogramCuts() {
-  cut_ptrs_.HostVector().emplace_back(0);
-}
+HistogramCuts::HistogramCuts() { cut_ptrs_.HostVector().emplace_back(0); }
 
 HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted,
                               Span<float const> hessian) {
@@ -53,10 +63,7 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
     }
     container.MakeCuts(ctx, m->Info(), &out);
   } else {
-    SortedSketchContainer container{ctx,
-                                    max_bins,
-                                    m->Info().feature_types.ConstHostSpan(),
-                                    reduced,
+    SortedSketchContainer container{ctx, max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
                                     HostSketchContainer::UseGroup(info)};
     for (auto const &page : m->GetBatches<SortedCSCPage>(ctx)) {
       container.PushColPage(page, info, hessian);
@@ -96,9 +103,9 @@ void CopyHist(GHistRow dst, const GHistRow src, size_t begin, size_t end) {
  */
 void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2, size_t begin,
                      size_t end) {
-  double* pdst = reinterpret_cast<double*>(dst.data());
-  const double* psrc1 = reinterpret_cast<const double*>(src1.data());
-  const double* psrc2 = reinterpret_cast<const double*>(src2.data());
+  double *pdst = reinterpret_cast<double *>(dst.data());
+  const double *psrc1 = reinterpret_cast<const double *>(src1.data());
+  const double *psrc2 = reinterpret_cast<const double *>(src2.data());
 
   for (size_t i = 2 * begin; i < 2 * end; ++i) {
     pdst[i] = psrc1[i] - psrc2[i];
@@ -112,13 +119,10 @@ struct Prefetch {
 
  private:
   static constexpr size_t kNoPrefetchSize =
-      kPrefetchOffset + kCacheLineSize /
-                        sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);
+      kPrefetchOffset + kCacheLineSize / sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);
 
  public:
-  static size_t NoPrefetchSize(size_t rows) {
-    return std::min(rows, kNoPrefetchSize);
-  }
+  static size_t NoPrefetchSize(size_t rows) { return std::min(rows, kNoPrefetchSize); }
 
   template <typename T>
   static constexpr size_t GetPrefetchStep() {
@@ -134,9 +138,7 @@ struct RuntimeFlags {
   const BinTypeSize bin_type_size;
 };
 
-template <bool _any_missing,
-          bool _first_page = false,
-          bool _read_by_column = false,
+template <bool _any_missing, bool _first_page = false, bool _read_by_column = false,
           typename BinIdxTypeName = uint8_t>
 class GHistBuildingManager {
  public:
@@ -170,7 +172,7 @@ class GHistBuildingManager {
    * and forward the call there.
    */
   template <typename Fn>
-  static void DispatchAndExecute(const RuntimeFlags& flags, Fn&& fn) {
+  static void DispatchAndExecute(const RuntimeFlags &flags, Fn &&fn) {
     if (flags.first_page != kFirstPage) {
       SetFirstPage<true>::Type::DispatchAndExecute(flags, std::forward<Fn>(fn));
     } else if (flags.read_by_column != kReadByColumn) {
@@ -186,6 +188,81 @@ class GHistBuildingManager {
   }
 };
 
+#ifdef XGBOOST_SVE_COMPILER_SUPPORT
+template <typename BinIdxType>
+__attribute__((target("arch=armv8-a+sve")))
+inline void UpdateHistogramWithSVE(size_t row_size, const BinIdxType *gr_index_local,
+                                   const std::uint32_t *offsets, double *hist_data,
+                                   const float *p_gpair, size_t idx_gh, const uint32_t two,
+                                   bool kAnyMissing) {
+  // Load the gradient and hessian values from p_gpair into SVE vector registers
+  svfloat64_t grad = svdup_n_f64(p_gpair[idx_gh]);
+  svfloat64_t hess = svdup_n_f64(p_gpair[idx_gh + 1]);
+
+  for (size_t j = 0; j < row_size; j += svcntw()) {
+    // Create a predicate (mask) for 32-bit & 64-bit elements, active only for valid elements
+    svbool_t pg32 = svwhilelt_b32(j, row_size);
+    svbool_t pg64_lower = svwhilelt_b64(j, row_size);
+    svbool_t pg64_upper = svwhilelt_b64(j + svcntd(), row_size);
+
+    // Load the gradient index values and offsets for the current chunk of the row
+    svuint32_t gr_index_vec =
+        svld1ub_u32(pg32, reinterpret_cast<const uint8_t *>(&gr_index_local[j]));
+    svuint32_t offsets_vec = svld1(pg32, &offsets[j]);
+
+    svuint32_t idx_bin_vec;
+    if (kAnyMissing) {
+      idx_bin_vec = svmul_n_u32_x(pg32, gr_index_vec, two);
+    } else {
+      svuint32_t temp = svadd_u32_m(pg32, gr_index_vec, offsets_vec);
+      idx_bin_vec = svmul_n_u32_x(pg32, temp, two);
+    }
+
+    // Unpack 32-bit index vector into 64-bit vectors from lower & upper half respectively
+    svuint64_t idx_bin_vec0_0 = svunpklo_u64(idx_bin_vec);
+    svuint64_t idx_bin_vec0_1 = svunpkhi_u64(idx_bin_vec);
+
+    // Increment the indices by 1 for the hessian
+    svuint64_t idx_bin_vec1_0 = svadd_n_u64_m(pg64_lower, idx_bin_vec0_0, 1);
+    svuint64_t idx_bin_vec1_1 = svadd_n_u64_m(pg64_upper, idx_bin_vec0_1, 1);
+
+    // Gather the histogram data corresponding to the computed indices
+    svfloat64_t hist0_vec0 = svld1_gather_index(pg64_lower, hist_data, idx_bin_vec0_0);
+    svfloat64_t hist0_vec1 = svld1_gather_index(pg64_upper, hist_data, idx_bin_vec0_1);
+    svfloat64_t hist1_vec0 = svld1_gather_index(pg64_lower, hist_data, idx_bin_vec1_0);
+    svfloat64_t hist1_vec1 = svld1_gather_index(pg64_upper, hist_data, idx_bin_vec1_1);
+
+    // Accumulate the gradient and hessian values into the histogram
+    hist0_vec0 = svadd_f64_m(pg64_lower, hist0_vec0, grad);
+    hist0_vec1 = svadd_f64_m(pg64_upper, hist0_vec1, grad);
+    hist1_vec0 = svadd_f64_m(pg64_lower, hist1_vec0, hess);
+    hist1_vec1 = svadd_f64_m(pg64_upper, hist1_vec1, hess);
+
+    // Store the updated histogram data back into memory
+    svst1_scatter_index(pg64_lower, hist_data, idx_bin_vec0_0, hist0_vec0);
+    svst1_scatter_index(pg64_upper, hist_data, idx_bin_vec0_1, hist0_vec1);
+    svst1_scatter_index(pg64_lower, hist_data, idx_bin_vec1_0, hist1_vec0);
+    svst1_scatter_index(pg64_upper, hist_data, idx_bin_vec1_1, hist1_vec1);
+  }
+}
+#endif
+
+// Returns true if the SVE ISA is available on the current CPU (with caching)
+bool check_sve_hw_support() {
+#if defined(__linux__)
+  static int cached_sve_support = -1;
+  if (cached_sve_support == -1) {
+    int ret = prctl(PR_SVE_GET_VL);
+    if (ret == -1) {
+      cached_sve_support = 0;
+    } else {
+      cached_sve_support = 1;
+    }
+  }
+  return cached_sve_support == 1;
+#else
+  // prctl(PR_SVE_GET_VL) is Linux-specific; assume SVE is unavailable elsewhere.
+  return false;
+#endif
+}
+
+static bool sve_enabled = check_sve_hw_support();
+
 template <bool do_prefetch, class BuildingManager>
 void RowsWiseBuildHistKernel(Span<GradientPair const> gpair, Span<bst_idx_t const> row_indices,
                              const GHistIndexMatrix &gmat, GHistRow hist) {
@@ -223,24 +300,20 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair,
     }
     const BinIdxType *gr_index_local = gradient_index + icol_start;
 
-    // The trick with pgh_t buffer helps the compiler to generate faster binary.
-    const float pgh_t[] = {p_gpair[idx_gh], p_gpair[idx_gh + 1]};
-    for (size_t j = 0; j < row_size; ++j) {
-      const uint32_t idx_bin = two * (static_cast<uint32_t>(gr_index_local[j]) +
-                                      (kAnyMissing ? 0 : offsets[j]));
-      auto hist_local = hist_data + idx_bin;
-      *(hist_local) += pgh_t[0];
-      *(hist_local + 1) += pgh_t[1];
+    if (sve_enabled) {
+    #ifdef XGBOOST_SVE_COMPILER_SUPPORT
+      UpdateHistogramWithSVE(row_size, gr_index_local, offsets, hist_data, p_gpair, idx_gh, two,
+                             kAnyMissing);
+    #endif
+    } else {
+      // The trick with pgh_t buffer helps the compiler to generate faster binary.
+      const float pgh_t[] = {p_gpair[idx_gh], p_gpair[idx_gh + 1]};
+      for (size_t j = 0; j < row_size; ++j) {
+        const uint32_t idx_bin =
+            two * (static_cast<uint32_t>(gr_index_local[j]) + (kAnyMissing ? 0 : offsets[j]));
+        auto hist_local = hist_data + idx_bin;
+        *(hist_local) += pgh_t[0];
+        *(hist_local + 1) += pgh_t[1];
+      }
     }
   }
 }
@@ -279,7 +359,9 @@ void ColsWiseBuildHistKernel(Span<GradientPair const> gpair,