diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f5ecb94f68b1..001e17b510a3 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -95,7 +95,7 @@ jobs: run: | mkdir build cd build - cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX + cmake .. -DGOOGLE_TEST=ON -DUSE_DMLC_GTEST=ON -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_INSTALL_PREFIX=$CONDA_PREFIX make -j$(nproc) - name: Run gtest binary for SYCL run: | diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml index e6eec86c8606..83f0ad495fc3 100644 --- a/.github/workflows/python_tests.yml +++ b/.github/workflows/python_tests.yml @@ -294,7 +294,7 @@ jobs: run: | mkdir build cd build - cmake .. -DPLUGIN_SYCL=ON -DCMAKE_PREFIX_PATH=$CONDA_PREFIX + cmake .. -DPLUGIN_SYCL=ON -DCMAKE_CXX_COMPILER=g++ -DCMAKE_C_COMPILER=gcc -DCMAKE_PREFIX_PATH=$CONDA_PREFIX make -j$(nproc) - name: Install Python package run: | diff --git a/CMakeLists.txt b/CMakeLists.txt index bb526ad02911..f7cf8a6cfa87 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,6 @@ cmake_minimum_required(VERSION 3.18 FATAL_ERROR) if(PLUGIN_SYCL) - set(CMAKE_CXX_COMPILER "g++") - set(CMAKE_C_COMPILER "gcc") string(REPLACE " -isystem ${CONDA_PREFIX}/include" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") endif() diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 82d7011de3a4..98d31acf8c6c 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -66,6 +66,6 @@ Imports: data.table (>= 1.9.6), jsonlite (>= 1.0) Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 Encoding: UTF-8 SystemRequirements: GNU make, C++17 diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 0aa3cdcf1df0..30bf1f1ea149 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -102,6 +102,18 @@ #' It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be #' \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.} #' } +#' +#' For custom objectives, one should pass a function taking as input the current predictions (as a numeric +#' vector or matrix) and the training data (as an `xgb.DMatrix` object) that will return a list with elements +#' `grad` and `hess`, which should be numeric vectors or matrices whose number of rows matches the number +#' of rows in the training data (same shape as the predictions that are passed as input to the function). +#' For multi-valued custom objectives, they should have shape `[nrows, ntargets]`. Note that negative values of +#' the Hessian will be clipped, so one might consider using the expected Hessian (Fisher information) if the +#' objective is non-convex. +#' +#' See the tutorials \href{https://xgboost.readthedocs.io/en/stable/tutorials/custom_metric_obj.html}{ +#' Custom Objective and Evaluation Metric} and \href{https://xgboost.readthedocs.io/en/stable/tutorials/advanced_custom_obj}{ +#' Advanced Usage of Custom Objectives} for more information about custom objectives. #' } #' \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 #' \item{ \code{eval_metric} evaluation metrics for validation data. 
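As a companion to the R description above, the same contract can be sketched in Python, where a custom objective receives the current predictions and the ``xgb.DMatrix`` and returns the gradient and Hessian with one value (or one row, for multi-target objectives) per training row. This is only an illustrative sketch; the function and variable names are not part of the patch:

.. code-block:: python
    :caption: Python

    import numpy as np
    import xgboost as xgb

    def squared_error_obj(predt: np.ndarray, dtrain: xgb.DMatrix):
        """A minimal custom objective for 0.5 * (pred - label)^2."""
        y = dtrain.get_label()
        grad = predt - y             # first derivative, one value per row
        hess = np.ones_like(predt)   # second derivative is constant here
        return grad, hess

    # Passed to the training function as, e.g.:
    # booster = xgb.train({"tree_method": "hist"}, dtrain, obj=squared_error_obj)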
diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 937020e0dd38..f641b1374420 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -144,6 +144,18 @@ It might be useful, e.g., for modeling insurance claims severity, or for any out It might be useful, e.g., for modeling total loss in insurance, or for any outcome that might be \href{https://en.wikipedia.org/wiki/Tweedie_distribution#Applications}{Tweedie-distributed}.} } + +For custom objectives, one should pass a function taking as input the current predictions (as a numeric +vector or matrix) and the training data (as an \code{xgb.DMatrix} object) that will return a list with elements +\code{grad} and \code{hess}, which should be numeric vectors or matrices whose number of rows matches the number +of rows in the training data (same shape as the predictions that are passed as input to the function). +For multi-valued custom objectives, they should have shape \verb{[nrows, ntargets]}. Note that negative values of +the Hessian will be clipped, so one might consider using the expected Hessian (Fisher information) if the +objective is non-convex. + +See the tutorials \href{https://xgboost.readthedocs.io/en/stable/tutorials/custom_metric_obj.html}{ Custom Objective and Evaluation Metric} and \href{https://xgboost.readthedocs.io/en/stable/tutorials/advanced_custom_obj}{ Advanced Usage of Custom Objectives} for more information about custom objectives. } \item \code{base_score} the initial prediction score of all instances, global bias. Default: 0.5 \item{ \code{eval_metric} evaluation metrics for validation data. diff --git a/README.md b/README.md index 220e94637fe1..b27cce673585 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ [Documentation](https://xgboost.readthedocs.org) | [Resources](demo/README.md) | [Contributors](CONTRIBUTORS.md) | -[Release Notes](NEWS.md) +[Release Notes](https://xgboost.readthedocs.io/en/latest/changes/index.html) XGBoost is an optimized distributed gradient boosting library designed to be highly ***efficient***, ***flexible*** and ***portable***. It implements machine learning algorithms under the [Gradient Boosting](https://en.wikipedia.org/wiki/Gradient_boosting) framework. diff --git a/demo/guide-python/custom_softmax.py b/demo/guide-python/custom_softmax.py index 36265cf4d6c5..2d2ebae2041b 100644 --- a/demo/guide-python/custom_softmax.py +++ b/demo/guide-python/custom_softmax.py @@ -6,7 +6,8 @@ XGBoost returns transformed prediction for multi-class objective function. More details in comments. -See :doc:`/tutorials/custom_metric_obj` for detailed tutorial and notes. +See :doc:`/tutorials/custom_metric_obj` and :doc:`/tutorials/advanced_custom_obj` for +detailed tutorial and notes. ''' @@ -39,7 +40,9 @@ def softmax(x): def softprob_obj(predt: np.ndarray, data: xgb.DMatrix): - '''Loss function. Computing the gradient and approximated hessian (diagonal). + '''Loss function. Computing the gradient and upper bound on the + Hessian with a diagonal structure for XGBoost (note that this is + not the true Hessian). Reimplements the `multi:softprob` inside XGBoost. ''' @@ -61,7 +64,7 @@ def softprob_obj(predt: np.ndarray, data: xgb.DMatrix): eps = 1e-6 - # compute the gradient and hessian, slow iterations in Python, only + # compute the gradient and hessian upper bound, slow iterations in Python, only # suitable for demo. Also the one in native XGBoost core is more robust to # numeric overflow as we don't do anything to mitigate the `exp` in # `softmax` here. 
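The demo computes these quantities row by row; in vectorized form, the gradient of the multi-class log loss and a commonly used diagonal upper bound on its Hessian, 2 * p * (1 - p), can be sketched as follows (the names here are illustrative and not part of the demo file):

.. code-block:: python
    :caption: Python

    import numpy as np

    def softprob_grad_hess(predt: np.ndarray, labels: np.ndarray, eps: float = 1e-6):
        """Gradient and diagonal Hessian upper bound for multi-class log loss.

        `predt` holds raw scores with shape [n_samples, n_classes];
        `labels` holds integer class ids with shape [n_samples].
        """
        # Row-wise softmax, subtracting the per-row max for numerical stability.
        e = np.exp(predt - predt.max(axis=1, keepdims=True))
        p = e / e.sum(axis=1, keepdims=True)
        onehot = np.eye(predt.shape[1])[labels.astype(np.int64)]
        grad = p - onehot
        # Diagonal upper bound, not the true (full) Hessian.
        hess = np.maximum(2.0 * p * (1.0 - p), eps)
        return grad, hess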
diff --git a/demo/rmm_plugin/README.rst b/demo/rmm_plugin/README.rst index 4742507d240d..28b816eb2574 100644 --- a/demo/rmm_plugin/README.rst +++ b/demo/rmm_plugin/README.rst @@ -1,5 +1,5 @@ -Using XGBoost with RAPIDS Memory Manager (RMM) plugin (EXPERIMENTAL) -==================================================================== +Using XGBoost with RAPIDS Memory Manager (RMM) plugin +===================================================== `RAPIDS Memory Manager (RMM) `__ library provides a collection of efficient memory allocators for NVIDIA GPUs. It is now possible to use @@ -47,5 +47,15 @@ the global configuration ``use_rmm``: with xgb.config_context(use_rmm=True): clf = xgb.XGBClassifier(tree_method="hist", device="cuda") -Depending on the choice of memory pool size or type of allocator, this may have negative -performance impact. +Depending on the choice of memory pool size and the type of the allocator, this can make +memory usage more consistent, at the cost of slightly degraded performance. + +******************************* +No Device Ordinal for Multi-GPU +******************************* + +Since with RMM the memory pool is pre-allocated on a specific device, changing the CUDA +device ordinal in XGBoost can result in the memory error ``cudaErrorIllegalAddress``. Use the +``CUDA_VISIBLE_DEVICES`` environment variable instead of the ``device="cuda:1"`` parameter +for selecting the device. For distributed training, distributed computing frameworks like +``dask-cuda`` are responsible for device management. \ No newline at end of file diff --git a/doc/changes/index.rst b/doc/changes/index.rst index 68eead924d71..09bc215075e4 100644 --- a/doc/changes/index.rst +++ b/doc/changes/index.rst @@ -2,6 +2,8 @@ Release Notes ############# +For release notes prior to the 2.1 release, please see `news `__ . + .. toctree:: :maxdepth: 1 :caption: Contents: diff --git a/doc/parameter.rst b/doc/parameter.rst index 00f0eaea6193..a776559223f4 100644 --- a/doc/parameter.rst +++ b/doc/parameter.rst @@ -25,7 +25,11 @@ Global Configuration The following parameters can be set in the global scope, using :py:func:`xgboost.config_context()` (Python) or ``xgb.set.config()`` (R). * ``verbosity``: Verbosity of printing messages. Valid values of 0 (silent), 1 (warning), 2 (info), and 3 (debug). -* ``use_rmm``: Whether to use RAPIDS Memory Manager (RMM) to allocate GPU memory. This option is only applicable when XGBoost is built (compiled) with the RMM plugin enabled. Valid values are ``true`` and ``false``. + +* ``use_rmm``: Whether to use RAPIDS Memory Manager (RMM) to allocate GPU cache + memory. The primary memory is always allocated on the RMM pool when XGBoost is built + (compiled) with the RMM plugin enabled. Valid values are ``true`` and ``false``. See + :doc:`/python/rmm-examples/index` for details. ****************** General Parameters diff --git a/doc/python/python_api.rst b/doc/python/python_api.rst index 4ba520fe46eb..86da4fda0cfc 100644 --- a/doc/python/python_api.rst +++ b/doc/python/python_api.rst @@ -14,6 +14,8 @@ Global Configuration .. autofunction:: xgboost.get_config +.. autofunction:: xgboost.build_info + Core Data Structure ------------------- .. 
automodule:: xgboost.core diff --git a/doc/tutorials/advanced_custom_obj.rst b/doc/tutorials/advanced_custom_obj.rst new file mode 100644 index 000000000000..b78cdc292eff --- /dev/null +++ b/doc/tutorials/advanced_custom_obj.rst @@ -0,0 +1,720 @@ +################################### +Advanced Usage of Custom Objectives +################################### + +**Contents** + +.. contents:: + :backlinks: none + :local: + +******** +Overview +******** + +XGBoost allows optimizing custom user-defined functions based on +gradients and Hessians provided by the user for the desired objective function. + +In order for a custom objective to work as intended: + +- The function to optimize must be smooth and twice differentiable. +- The function must be additive with respect to rows / observations, + such as a likelihood function with i.i.d. assumptions. +- The range of the scores for the function must be unbounded + (i.e. it should not work exclusively with positive numbers, for example). +- The function must be convex. Note that, if the Hessian has negative + values, they will be clipped, which will likely result in a model + that does not fit the function well. +- For multi-output objectives, there should not be dependencies between + different targets (i.e. Hessian should be diagonal for each row). + + +Some of these limitations can nevertheless be worked around by foregoing +the true Hessian of the function, using something else instead such as an +approximation with better properties - convergence might be slower when +not using the true Hessian of a function, but many theoretical guarantees +should still hold and result in usable models. For example, XGBoost's +internal implementation of multinomial logistic regression uses an upper +bound on the Hessian with diagonal structure instead of the true Hessian +which is a full square matrix for each row in the data. + +This tutorial provides some suggestions for use-cases that do not perfectly +fit the criteria outlined above, by showing how to solve a Dirichlet regression +parameterized by concentrations. + +A Dirichlet regression model poses certain challenges for XGBoost: + +- Concentration parameters must be positive. An easy way to achieve this is + by applying an 'exp' transform on raw unbounded values, but in such case + the objective becomes non-convex. Furthermore, note that this function is + not in the exponential family, unlike typical distributions used for GLM + models. +- The Hessian has dependencies between targets - that is, for a Dirichlet + distribution with 'k' parameters, each row will have a full Hessian matrix + of dimensions ``[k, k]``. +- An optimal intercept for this type of model would involve a vector of + values rather than the same value for every target. + +In order to use this type of model as a custom objective: + +- It's possible to use the expected Hessian (a.k.a. the Fisher information + matrix or expected information) instead of the true Hessian. The expected + Hessian is always positive semi-definite for an additive likelihood, even + if the true Hessian isn't. +- It's possible to use an upper bound on the expected Hessian with a diagonal + structure, such that a second-order approximation under this diagonal + bound would always yield greater or equal function values than under the + non-diagonal expected Hessian. 
+- Since the ``base_score`` parameter that XGBoost uses for an intercept is + limited to a scalar, one can use the ``base_margin`` functionality instead, + but note that using it requires a bit more effort. + +***************************** +Dirichlet Regression Formulae +***************************** + +The Dirichlet distribution is a generalization of the Beta distribution to +multiple dimensions. It models proportions data in which the values sum to +1, and is typically used as part of composite models (e.g. Dirichlet-multinomial) +or as a prior in Bayesian models, but it also can be used on its own for +proportions data for example. + +Its likelihood for a given observation with values ``y`` and a given prediction ``x`` +is given as follows: + +.. math:: + L(\mathbf{y} | \mathbf{x}) = \frac{1}{\beta(\mathbf{x})} \prod_{i=1}^k y_i^{x_i - 1} + +Where: + +.. math:: + \beta(\mathbf{x}) = \frac{ \prod_{i=1}^k \Gamma(x_i) }{\Gamma( \sum_{i=1}^k x_i )} + + +In this case, we want to optimize the negative of the log-likelihood summed across rows. +The resulting function, gradient and Hessian could be implemented as follows: + +.. code-block:: python + :caption: Python + + import numpy as np + from scipy.special import loggamma, psi as digamma, polygamma + trigamma = lambda x: polygamma(1, x) + + def dirichlet_fun(pred: np.ndarray, Y: np.ndarray) -> float: + epred = np.exp(pred) + sum_epred = np.sum(epred, axis=1, keepdims=True) + return ( + loggamma(epred).sum() + - loggamma(sum_epred).sum() + - np.sum(np.log(Y) * (epred - 1)) + ) + def dirichlet_grad(pred: np.ndarray, Y: np.ndarray) -> np.ndarray: + epred = np.exp(pred) + return epred * ( + digamma(epred) + - digamma(np.sum(epred, axis=1, keepdims=True)) + - np.log(Y) + ) + def dirichlet_hess(pred: np.ndarray, Y: np.ndarray) -> np.ndarray: + epred = np.exp(pred) + grad = dirichlet_grad(pred, Y) + k = Y.shape[1] + H = np.empty((pred.shape[0], k, k)) + for row in range(pred.shape[0]): + H[row, :, :] = ( + - trigamma(epred[row].sum()) * np.outer(epred[row], epred[row]) + + np.diag(grad[row] + trigamma(epred[row]) * epred[row] ** 2) + ) + return H + +.. code-block:: r + :caption: R + + softmax <- function(x) { + max.x <- max(x) + e <- exp(x - max.x) + return(e / sum(e)) + } + + dirichlet.fun <- function(pred, y) { + epred <- exp(pred) + sum_epred <- rowSums(epred) + return( + sum(lgamma(epred)) + - sum(lgamma(sum_epred)) + - sum(log(y) * (epred - 1)) + ) + } + + dirichlet.grad <- function(pred, y) { + epred <- exp(pred) + return( + epred * ( + digamma(epred) + - digamma(rowSums(epred)) + - log(y) + ) + ) + } + + dirichlet.hess <- function(pred, y) { + epred <- exp(pred) + grad <- dirichlet.grad(pred, y) + k <- ncol(y) + H <- array(dim = c(nrow(y), k, k)) + for (row in seq_len(nrow(y))) { + H[row, , ] <- ( + - trigamma(sum(epred[row,])) * tcrossprod(epred[row,]) + + diag(grad[row,] + trigamma(epred[row,]) * epred[row,]^2) + ) + } + return(H) + } + + +Convince yourself that the implementation is correct: + +.. 
code-block:: python + :caption: Python + + from math import isclose + from scipy import stats + from scipy.optimize import check_grad + from scipy.special import softmax + + def gen_random_dirichlet(rng: np.random.Generator, m: int, k: int): + alpha = np.exp(rng.standard_normal(size=k)) + return rng.dirichlet(alpha, size=m) + + def test_dirichlet_fun_grad_hess(): + k = 3 + m = 10 + rng = np.random.default_rng(seed=123) + Y = gen_random_dirichlet(rng, m, k) + x0 = rng.standard_normal(size=k) + for row in range(Y.shape[0]): + fun_row = dirichlet_fun(x0.reshape((1,-1)), Y[[row]]) + ref_logpdf = stats.dirichlet.logpdf( + Y[row] / Y[row].sum(), # <- avoid roundoff error + np.exp(x0), + ) + assert isclose(fun_row, -ref_logpdf) + + gdiff = check_grad( + lambda pred: dirichlet_fun(pred.reshape((1,-1)), Y[[row]]), + lambda pred: dirichlet_grad(pred.reshape((1,-1)), Y[[row]]), + x0 + ) + assert gdiff <= 1e-6 + + H_numeric = np.empty((k,k)) + eps = 1e-7 + for ii in range(k): + x0_plus_eps = x0.reshape((1,-1)).copy() + x0_plus_eps[0,ii] += eps + for jj in range(k): + H_numeric[ii, jj] = ( + dirichlet_grad(x0_plus_eps, Y[[row]])[0][jj] + - dirichlet_grad(x0.reshape((1,-1)), Y[[row]])[0][jj] + ) / eps + H = dirichlet_hess(x0.reshape((1,-1)), Y[[row]])[0] + np.testing.assert_almost_equal(H, H_numeric, decimal=6) + test_dirichlet_fun_grad_hess() + + +.. code-block:: r + :caption: R + + library(DirichletReg) + library(testthat) + + test_that("dirichlet formulae", { + k <- 3L + m <- 10L + set.seed(123) + alpha <- exp(rnorm(k)) + y <- rdirichlet(m, alpha) + x0 <- rnorm(k) + + for (row in seq_len(m)) { + logpdf <- dirichlet.fun(matrix(x0, nrow=1), y[row,,drop=F]) + ref_logpdf <- ddirichlet(y[row,,drop=F], exp(x0), log = T) + expect_equal(logpdf, -ref_logpdf) + + eps <- 1e-7 + grad_num <- numeric(k) + for (col in seq_len(k)) { + xplus <- x0 + xplus[col] <- x0[col] + eps + grad_num[col] <- ( + dirichlet.fun(matrix(xplus, nrow=1), y[row,,drop=F]) + - dirichlet.fun(matrix(x0, nrow=1), y[row,,drop=F]) + ) / eps + } + + grad <- dirichlet.grad(matrix(x0, nrow=1), y[row,,drop=F]) + expect_equal(grad |> as.vector(), grad_num, tolerance=1e-6) + + H_numeric <- array(dim=c(k, k)) + for (ii in seq_len(k)) { + xplus <- x0 + xplus[ii] <- x0[ii] + eps + for (jj in seq_len(k)) { + H_numeric[ii, jj] <- ( + dirichlet.grad(matrix(xplus, nrow=1), y[row,,drop=F])[1, jj] + - grad[1L, jj] + ) / eps + } + } + + H <- dirichlet.hess(matrix(xplus, nrow=1), y[row,,drop=F]) + expect_equal(H[1,,], H_numeric, tolerance=1e-6) + } + }) + +****************************************** +Dirichlet Regression as Objective Function +****************************************** + +As mentioned earlier, the Hessian of this function is problematic for +XGBoost: it can have a negative determinant, and might even have negative +values in the diagonal, which is problematic for optimization methods - in +XGBoost, those values would be clipped and the resulting model might not +end up producing sensible predictions. + +A potential workaround is to use the expected Hessian instead - that is, +the expected outer product of the gradient if the response variable were +distributed according to what is predicted. See the Wikipedia article +for more information: + +``_ + +In general, for objective functions in the exponential family, this is easy +to obtain from the gradient of the link function and the variance of the +probability distribution, but for other functions in general, it might +involve other types of calculations (e.g. 
covariances and covariances of +logarithms for Dirichlet). + +It nevertheless results in a form very similar to the Hessian. One can also +see from the differences here that, at an optimal point (gradient being zero), +the expected and true Hessian for Dirichlet will match, which is a nice +property for optimization (i.e. the Hessian will be positive at a stationary +point, which means it will be a minimum rather than a maximum or saddle point). + +.. code-block:: python + :caption: Python + + def dirichlet_expected_hess(pred: np.ndarray) -> np.ndarray: + epred = np.exp(pred) + k = pred.shape[1] + Ehess = np.empty((pred.shape[0], k, k)) + for row in range(pred.shape[0]): + Ehess[row, :, :] = ( + - trigamma(epred[row].sum()) * np.outer(epred[row], epred[row]) + + np.diag(trigamma(epred[row]) * epred[row] ** 2) + ) + return Ehess + def test_dirichlet_expected_hess(): + k = 3 + rng = np.random.default_rng(seed=123) + x0 = rng.standard_normal(size=k) + y_sample = rng.dirichlet(np.exp(x0), size=5_000_000) + x_broadcast = np.broadcast_to(x0, (y_sample.shape[0], k)) + g_sample = dirichlet_grad(x_broadcast, y_sample) + ref = (g_sample.T @ g_sample) / y_sample.shape[0] + Ehess = dirichlet_expected_hess(x0.reshape((1,-1)))[0] + np.testing.assert_almost_equal(Ehess, ref, decimal=2) + test_dirichlet_expected_hess() + +.. code-block:: r + :caption: R + + dirichlet.expected.hess <- function(pred) { + epred <- exp(pred) + k <- ncol(pred) + H <- array(dim = c(nrow(pred), k, k)) + for (row in seq_len(nrow(pred))) { + H[row, , ] <- ( + - trigamma(sum(epred[row,])) * tcrossprod(epred[row,]) + + diag(trigamma(epred[row,]) * epred[row,]^2) + ) + } + return(H) + } + + test_that("expected hess", { + k <- 3L + set.seed(123) + x0 <- rnorm(k) + alpha <- exp(x0) + n.samples <- 5e6 + y.samples <- rdirichlet(n.samples, alpha) + + x.broadcast <- rep(x0, n.samples) |> matrix(ncol=k, byrow=T) + grad.samples <- dirichlet.grad(x.broadcast, y.samples) + ref <- crossprod(grad.samples) / n.samples + Ehess <- dirichlet.expected.hess(matrix(x0, nrow=1)) + expect_equal(Ehess[1,,], ref, tolerance=1e-2) + }) + +But note that this is still not usable for XGBoost, since the expected +Hessian, just like the true Hessian, has shape ``[nrows, k, k]``, while +XGBoost requires something with shape ``[k, k]``. + +One may use the diagonal of the expected Hessian for each row, but it's +possible to do better: one can use instead an upper bound with diagonal +structure, since it should lead to better convergence properties, just like +for other Hessian-based optimization methods. + +In the absence of any obvious way of obtaining an upper bound, a possibility +here is to construct such a bound numerically based directly on the definition +of a diagonally dominant matrix: + +``_ + +That is: take the absolute value of the expected Hessian for each row of the data, +and sum by rows of the ``[k, k]``-shaped Hessian for that row in the data: + +.. code-block:: python + :caption: Python + + def dirichlet_diag_upper_bound_expected_hess( + pred: np.ndarray, Y: np.ndarray + ) -> np.ndarray: + Ehess = dirichlet_expected_hess(pred) + diag_bound_Ehess = np.empty((pred.shape[0], Y.shape[1])) + for row in range(pred.shape[0]): + diag_bound_Ehess[row, :] = np.abs(Ehess[row, :, :]).sum(axis=1) + return diag_bound_Ehess + +.. 
code-block:: r + :caption: R + + dirichlet.diag.upper.bound.expected.hess <- function(pred, y) { + Ehess <- dirichlet.expected.hess(pred) + diag.bound.Ehess <- array(dim=dim(pred)) + for (row in seq_len(nrow(pred))) { + diag.bound.Ehess[row,] <- abs(Ehess[row,,]) |> rowSums() + } + return(diag.bound.Ehess) + } + +(*note: the calculation can be made more efficiently than what is shown here +by not calculating the full matrix, and in R, by making the rows be the last +dimension and transposing after the fact*) + +With all these pieces in place, one can now frame this model into the format +required for XGBoost's custom objectives: + +.. code-block:: python + :caption: Python + + import xgboost as xgb + from typing import Tuple + + def dirichlet_xgb_objective( + pred: np.ndarray, dtrain: xgb.DMatrix + ) -> Tuple[np.ndarray, np.ndarray]: + Y = dtrain.get_label().reshape(pred.shape) + return ( + dirichlet_grad(pred, Y), + dirichlet_diag_upper_bound_expected_hess(pred, Y), + ) + +.. code-block:: r + :caption: R + + library(xgboost) + + dirichlet.xgb.objective <- function(pred, dtrain) { + y <- getinfo(dtrain, "label") + return( + list( + grad = dirichlet.grad(pred, y), + hess = dirichlet.diag.upper.bound.expected.hess(pred, y) + ) + ) + } + +And for an evaluation metric monitoring based on the Dirichlet log-likelihood: + +.. code-block:: python + :caption: Python + + def dirichlet_eval_metric( + pred: np.ndarray, dtrain: xgb.DMatrix + ) -> Tuple[str, float]: + Y = dtrain.get_label().reshape(pred.shape) + return "dirichlet_ll", dirichlet_fun(pred, Y) + +.. code-block:: r + :caption: R + + dirichlet.eval.metric <- function(pred, dtrain) { + y <- getinfo(dtrain, "label") + ll <- dirichlet.fun(pred, y) + return( + list( + metric = "dirichlet_ll", + value = ll + ) + ) + } + +***************** +Practical Example +***************** + +A good source for test datasets for proportions data is the R package ``DirichletReg``: + +``_ + +For this example, we'll now use the Arctic Lake dataset +(Aitchison, J. (2003). The Statistical Analysis of Compositional Data. The Blackburn Press, Caldwell, NJ.), +taken from the ``DirichletReg`` R package, which consists of 39 rows with one predictor variable 'depth' +and a three-valued response variable denoting the sediment composition of the measurements in this arctic +lake (sand, silt, clay). + +The data: + +.. code-block:: python + :caption: Python + + # depth + X = np.array([ + 10.4,11.7,12.8,13,15.7,16.3,18,18.7,20.7,22.1, + 22.4,24.4,25.8,32.5,33.6,36.8,37.8,36.9,42.2,47, + 47.1,48.4,49.4,49.5,59.2,60.1,61.7,62.4,69.3,73.6, + 74.4,78.5,82.9,87.7,88.1,90.4,90.6,97.7,103.7, + ]).reshape((-1,1)) + # sand, silt, clay + Y = np.array([ + [0.775,0.195,0.03], [0.719,0.249,0.032], [0.507,0.361,0.132], + [0.522,0.409,0.066], [0.7,0.265,0.035], [0.665,0.322,0.013], + [0.431,0.553,0.016], [0.534,0.368,0.098], [0.155,0.544,0.301], + [0.317,0.415,0.268], [0.657,0.278,0.065], [0.704,0.29,0.006], + [0.174,0.536,0.29], [0.106,0.698,0.196], [0.382,0.431,0.187], + [0.108,0.527,0.365], [0.184,0.507,0.309], [0.046,0.474,0.48], + [0.156,0.504,0.34], [0.319,0.451,0.23], [0.095,0.535,0.37], + [0.171,0.48,0.349], [0.105,0.554,0.341], [0.048,0.547,0.41], + [0.026,0.452,0.522], [0.114,0.527,0.359], [0.067,0.469,0.464], + [0.069,0.497,0.434], [0.04,0.449,0.511], [0.074,0.516,0.409], + [0.048,0.495,0.457], [0.045,0.485,0.47], [0.066,0.521,0.413], + [0.067,0.473,0.459], [0.074,0.456,0.469], [0.06,0.489,0.451], + [0.063,0.538,0.399], [0.025,0.48,0.495], [0.02,0.478,0.502], + ]) + +.. 
code-block:: r + :caption: R + + data("ArcticLake", package="DirichletReg") + x <- ArcticLake[, c("depth"), drop=F] + y <- ArcticLake[, c("sand", "silt", "clay")] |> as.matrix() + +Fitting an XGBoost model and making predictions: + +.. code-block:: python + :caption: Python + + from typing import Dict, List + + dtrain = xgb.DMatrix(X, label=Y) + results: Dict[str, Dict[str, List[float]]] = {} + booster = xgb.train( + params={ + "tree_method": "hist", + "num_target": Y.shape[1], + "base_score": 0, + "disable_default_eval_metric": True, + "max_depth": 3, + "seed": 123, + }, + dtrain=dtrain, + num_boost_round=10, + obj=dirichlet_xgb_objective, + evals=[(dtrain, "Train")], + evals_result=results, + custom_metric=dirichlet_eval_metric, + ) + yhat = softmax(booster.inplace_predict(X), axis=1) + +.. code-block:: r + :caption: R + + dtrain <- xgb.DMatrix(x, y) + booster <- xgb.train( + params = list( + tree_method="hist", + num_target=ncol(y), + base_score=0, + disable_default_eval_metric=TRUE, + max_depth=3, + seed=123 + ), + data = dtrain, + nrounds = 10, + obj = dirichlet.xgb.objective, + evals = list(Train=dtrain), + eval_metric = dirichlet.eval.metric + ) + raw.pred <- predict(booster, x, reshape=TRUE) + yhat <- apply(raw.pred, 1, softmax) |> t() + + +Should produce an evaluation log as follows (note: the function is decreasing as +expected - but unlike other objectives, the minimum value here can reach below zero): + +.. code-block:: none + + [0] Train-dirichlet_ll:-40.25009 + [1] Train-dirichlet_ll:-47.69122 + [2] Train-dirichlet_ll:-52.64620 + [3] Train-dirichlet_ll:-56.36977 + [4] Train-dirichlet_ll:-59.33048 + [5] Train-dirichlet_ll:-61.93359 + [6] Train-dirichlet_ll:-64.17280 + [7] Train-dirichlet_ll:-66.29709 + [8] Train-dirichlet_ll:-68.21001 + [9] Train-dirichlet_ll:-70.03442 + +One can confirm that the obtained ``yhat`` resembles the actual concentrations +to a large degree, beyond what would be expected from random predictions by a +simple look at both ``yhat`` and ``Y``. + +For better results, one might want to add an intercept. XGBoost only +allows using scalars for intercepts, but for a vector-valued model, +the optimal intercept should also have vector form. + +This can be done by supplying ``base_margin`` instead - unlike the +intercept, one must specifically supply values for every row here, +and said ``base_margin`` must be supplied again at the moment of making +predictions (i.e. does not get added automatically like ``base_score`` +does). + +For the case of a Dirichlet model, the optimal intercept can be obtained +efficiently using a general solver (e.g. SciPy's Newton solver) with +dedicated likelihood, gradient and Hessian functions for just the intercept part. +Further, note that if one frames it instead as bounded optimization without +applying 'exp' transform to the concentrations, it becomes instead a convex +problem, for which the true Hessian can be used without issues in other +classes of solvers. + +For simplicity, this example will nevertheless reuse the same likelihood +and gradient functions that were defined earlier alongside with SciPy's / R's +L-BFGS solver to obtain the optimal vector-valued intercept: + +.. 
code-block:: python + :caption: Python + + from scipy.optimize import minimize + + def get_optimal_intercepts(Y: np.ndarray) -> np.ndarray: + k = Y.shape[1] + res = minimize( + fun=lambda pred: dirichlet_fun( + np.broadcast_to(pred, (Y.shape[0], k)), + Y + ), + x0=np.zeros(k), + jac=lambda pred: dirichlet_grad( + np.broadcast_to(pred, (Y.shape[0], k)), + Y + ).sum(axis=0) + ) + return res["x"] + intercepts = get_optimal_intercepts(Y) + +.. code-block:: r + :caption: R + + get.optimal.intercepts <- function(y) { + k <- ncol(y) + broadcast.vec <- function(x) rep(x, nrow(y)) |> matrix(ncol=k, byrow=T) + res <- optim( + par = numeric(k), + fn = function(x) dirichlet.fun(broadcast.vec(x), y), + gr = function(x) dirichlet.grad(broadcast.vec(x), y) |> colSums(), + method = "L-BFGS-B" + ) + return(res$par) + } + intercepts <- get.optimal.intercepts(y) + + +Now fitting a model again, this time with the intercept: + +.. code-block:: python + :caption: Python + + base_margin = np.broadcast_to(intercepts, Y.shape) + dtrain_w_intercept = xgb.DMatrix(X, label=Y, base_margin=base_margin) + results: Dict[str, Dict[str, List[float]]] = {} + booster = xgb.train( + params={ + "tree_method": "hist", + "num_target": Y.shape[1], + "base_score": 0, + "disable_default_eval_metric": True, + "max_depth": 3, + "seed": 123, + }, + dtrain=dtrain_w_intercept, + num_boost_round=10, + obj=dirichlet_xgb_objective, + evals=[(dtrain, "Train")], + evals_result=results, + custom_metric=dirichlet_eval_metric, + ) + yhat = softmax( + booster.predict( + xgb.DMatrix(X, base_margin=base_margin) + ), + axis=1 + ) + +.. code-block:: r + :caption: R + + base.margin <- rep(intercepts, nrow(y)) |> matrix(nrow=nrow(y), byrow=T) + dtrain <- xgb.DMatrix(x, y, base_margin=base.margin) + booster <- xgb.train( + params = list( + tree_method="hist", + num_target=ncol(y), + base_score=0, + disable_default_eval_metric=TRUE, + max_depth=3, + seed=123 + ), + data = dtrain, + nrounds = 10, + obj = dirichlet.xgb.objective, + evals = list(Train=dtrain), + eval_metric = dirichlet.eval.metric + ) + raw.pred <- predict( + booster, + x, + base_margin=base.margin, + reshape=TRUE + ) + yhat <- apply(raw.pred, 1, softmax) |> t() + +.. code-block:: none + + [0] Train-dirichlet_ll:-37.01861 + [1] Train-dirichlet_ll:-42.86120 + [2] Train-dirichlet_ll:-46.55133 + [3] Train-dirichlet_ll:-49.15111 + [4] Train-dirichlet_ll:-51.02638 + [5] Train-dirichlet_ll:-52.53880 + [6] Train-dirichlet_ll:-53.77409 + [7] Train-dirichlet_ll:-54.88851 + [8] Train-dirichlet_ll:-55.95961 + [9] Train-dirichlet_ll:-56.95497 + +For this small example problem, predictions should be very similar between the +two and the version without intercepts achieved a lower objective function in the +training data (for the Python version at least), but for more serious usage with +real-world data, one is likely to observe better results when adding the intercepts. diff --git a/doc/tutorials/custom_metric_obj.rst b/doc/tutorials/custom_metric_obj.rst index 36bd0c8d65d5..51491e85c656 100644 --- a/doc/tutorials/custom_metric_obj.rst +++ b/doc/tutorials/custom_metric_obj.rst @@ -15,7 +15,7 @@ Overview XGBoost is designed to be an extensible library. One way to extend it is by providing our own objective function for training and corresponding metric for performance monitoring. This document introduces implementing a customized elementwise evaluation metric and -objective for XGBoost. Although the introduction uses Python for demonstration, the +objective for XGBoost. 
Although the introduction uses Python for demonstration, the concepts should be readily applicable to other language bindings. .. note:: * The ranking task does not support customized functions. * Breaking change was made in XGBoost 1.6. +See also the advanced usage example for more information about limitations and +workarounds for more complex objectives: :doc:`/tutorials/advanced_custom_obj` + In the following two sections, we will provide a step-by-step walk-through of implementing the ``Squared Log Error (SLE)`` objective function: diff --git a/doc/tutorials/index.rst b/doc/tutorials/index.rst index c82abf43f452..eca01e1ddeb4 100644 --- a/doc/tutorials/index.rst +++ b/doc/tutorials/index.rst @@ -30,5 +30,6 @@ See `Awesome XGBoost `_ for mo input_format param_tuning custom_metric_obj + advanced_custom_obj intercept privacy_preserving \ No newline at end of file diff --git a/doc/tutorials/learning_to_rank.rst b/doc/tutorials/learning_to_rank.rst index 74e52e1561aa..4d2cbad4aa47 100644 --- a/doc/tutorials/learning_to_rank.rst +++ b/doc/tutorials/learning_to_rank.rst @@ -72,8 +72,11 @@ Please note that, as of writing, there's no learning-to-rank interface in scikit .. code-block:: python import pandas as pd + + # `X`, `qid`, and `y` are from the previous snippet; they are all sorted by `sorted_idx`. df = pd.DataFrame(X, columns=[str(i) for i in range(X.shape[1])]) - df["qid"] = qid[sorted_idx] + df["qid"] = qid + ranker.fit(df, y) # No need to pass qid as a separate argument from sklearn.model_selection import StratifiedGroupKFold, cross_val_score diff --git a/include/xgboost/linalg.h b/include/xgboost/linalg.h index 8d23908e162b..ca5188a001c0 100644 --- a/include/xgboost/linalg.h +++ b/include/xgboost/linalg.h @@ -669,13 +669,13 @@ auto MakeVec(common::Span data, DeviceOrd device = DeviceOrd::CPU()) { template auto MakeVec(HostDeviceVector *data) { - return MakeVec(data->Device().IsCPU() ? data->HostPointer() : data->DevicePointer(), data->Size(), - data->Device()); + return MakeVec(data->Device().IsCUDA() ? data->DevicePointer() : data->HostPointer(), + data->Size(), data->Device()); } template auto MakeVec(HostDeviceVector const *data) { - return MakeVec(data->Device().IsCPU() ? data->ConstHostPointer() : data->ConstDevicePointer(), + return MakeVec(data->Device().IsCUDA() ? 
data->ConstDevicePointer() : data->ConstHostPointer(), data->Size(), data->Device()); } diff --git a/plugin/CMakeLists.txt b/plugin/CMakeLists.txt index 5d20e120e902..c0c31f3a100d 100644 --- a/plugin/CMakeLists.txt +++ b/plugin/CMakeLists.txt @@ -10,14 +10,14 @@ if(PLUGIN_SYCL) target_compile_definitions(plugin_sycl PUBLIC -DXGBOOST_USE_SYCL=1) target_link_libraries(plugin_sycl PUBLIC -fsycl) set_target_properties(plugin_sycl PROPERTIES - COMPILE_FLAGS -fsycl + COMPILE_FLAGS "-fsycl -fno-sycl-id-queries-fit-in-int" CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON POSITION_INDEPENDENT_CODE ON) if(USE_OPENMP) find_package(OpenMP REQUIRED) set_target_properties(plugin_sycl PROPERTIES - COMPILE_FLAGS "-fsycl -qopenmp") + COMPILE_FLAGS "-fsycl -fno-sycl-id-queries-fit-in-int -qopenmp") endif() # Get compilation and link flags of plugin_sycl and propagate to objxgboost target_link_libraries(objxgboost PUBLIC plugin_sycl) diff --git a/plugin/federated/federated_plugin.cc b/plugin/federated/federated_plugin.cc index f2b310553662..a7d6943153a4 100644 --- a/plugin/federated/federated_plugin.cc +++ b/plugin/federated/federated_plugin.cc @@ -79,7 +79,7 @@ void FederatedPluginMock::Reset(common::Span cutptrs, auto hist_raw = hist_buffer.subspan(i * hist_size, hist_size); auto hist = common::Span{reinterpret_cast(hist_raw.data()), hist_raw.size() / 2}; - common::RowSetCollection::Elem row_indices{rowptrs[i], rowptrs[i] + sizes[i], nids[i]}; + common::Span row_indices{rowptrs[i], rowptrs[i] + sizes[i]}; if (gmat_.IsDense()) { common::BuildHist(gpair, row_indices, gmat_, hist, false); } else { diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 7e0ae793ba6e..bd196e2e59f9 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -458,7 +458,7 @@ def pandas_pa_type(ser: Any) -> np.ndarray: # combine_chunks takes the most significant amount of time chunk: pa.Array = aa.combine_chunks() # When there's null value, we have to use copy - zero_copy = chunk.null_count == 0 + zero_copy = chunk.null_count == 0 and not pa.types.is_boolean(chunk.type) # Alternately, we can use chunk.buffers(), which returns a list of buffers and # we need to concatenate them ourselves. 
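# Note: Arrow stores boolean values bit-packed (one bit per value) while NumPy
# uses one byte per value, so a boolean chunk cannot be borrowed zero-copy and
# always requires a copy.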
# FIXME(jiamingy): Is there a better way to access the arrow buffer along with @@ -825,37 +825,9 @@ def _arrow_transform(data: DataType) -> Any: data = cast(pa.Table, data) - def type_mapper(dtype: pa.DataType) -> Optional[str]: - """Maps pyarrow type to pandas arrow extension type.""" - if pa.types.is_int8(dtype): - return pd.ArrowDtype(pa.int8()) - if pa.types.is_int16(dtype): - return pd.ArrowDtype(pa.int16()) - if pa.types.is_int32(dtype): - return pd.ArrowDtype(pa.int32()) - if pa.types.is_int64(dtype): - return pd.ArrowDtype(pa.int64()) - if pa.types.is_uint8(dtype): - return pd.ArrowDtype(pa.uint8()) - if pa.types.is_uint16(dtype): - return pd.ArrowDtype(pa.uint16()) - if pa.types.is_uint32(dtype): - return pd.ArrowDtype(pa.uint32()) - if pa.types.is_uint64(dtype): - return pd.ArrowDtype(pa.uint64()) - if pa.types.is_float16(dtype): - return pd.ArrowDtype(pa.float16()) - if pa.types.is_float32(dtype): - return pd.ArrowDtype(pa.float32()) - if pa.types.is_float64(dtype): - return pd.ArrowDtype(pa.float64()) - if pa.types.is_boolean(dtype): - return pd.ArrowDtype(pa.bool_()) - return None - # For common cases, this is zero-copy, can check with: # pa.total_allocated_bytes() - df = data.to_pandas(types_mapper=type_mapper) + df = data.to_pandas(types_mapper=pd.ArrowDtype) return df diff --git a/python-package/xgboost/sklearn.py b/python-package/xgboost/sklearn.py index 560a3a8ed285..6c19a6205e7d 100644 --- a/python-package/xgboost/sklearn.py +++ b/python-package/xgboost/sklearn.py @@ -517,6 +517,11 @@ def task(i: int) -> float: The value of the gradient for each sample point. hess: array_like of shape [n_samples] The value of the second derivative for each sample point + + Note that, if the custom objective produces negative values for + the Hessian, these will be clipped. If the objective is non-convex, + one might also consider using the expected Hessian (Fisher + information) instead. 
""" diff --git a/python-package/xgboost/testing/__init__.py b/python-package/xgboost/testing/__init__.py index 482da68c9fc9..e0096c89c9a8 100644 --- a/python-package/xgboost/testing/__init__.py +++ b/python-package/xgboost/testing/__init__.py @@ -248,13 +248,14 @@ def as_arrays( return X, y, w -def make_batches( +def make_batches( # pylint: disable=too-many-arguments,too-many-locals n_samples_per_batch: int, n_features: int, n_batches: int, use_cupy: bool = False, *, vary_size: bool = False, + random_state: int = 1994, ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: X = [] y = [] @@ -262,9 +263,9 @@ def make_batches( if use_cupy: import cupy - rng = cupy.random.RandomState(1994) + rng = cupy.random.RandomState(random_state) else: - rng = np.random.RandomState(1994) + rng = np.random.RandomState(random_state) for i in range(n_batches): n_samples = n_samples_per_batch + i * 10 if vary_size else n_samples_per_batch _X = rng.randn(n_samples, n_features) diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py index 0c4f290086d1..f4e97e59d363 100644 --- a/python-package/xgboost/testing/data.py +++ b/python-package/xgboost/testing/data.py @@ -164,10 +164,6 @@ def pd_arrow_dtypes() -> Generator: # Integer dtypes = pandas_pyarrow_mapper - Null: Union[float, None, Any] = np.nan - orig = pd.DataFrame( - {"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=np.float32 - ) # Create a dictionary-backed dataframe, enable this when the roundtrip is # implemented in pandas/pyarrow # @@ -190,24 +186,33 @@ def pd_arrow_dtypes() -> Generator: # pd_catcodes = pd_cat_df["f1"].cat.codes # assert pd_catcodes.equals(pa_catcodes) - for Null in (None, pd.NA): + for Null in (None, pd.NA, 0): for dtype in dtypes: if dtype.startswith("float16") or dtype.startswith("bool"): continue + # Use np.nan is a baseline + orig_null = Null if not pd.isna(Null) and Null == 0 else np.nan + orig = pd.DataFrame( + {"f0": [1, 2, orig_null, 3], "f1": [4, 3, orig_null, 1]}, + dtype=np.float32, + ) + df = pd.DataFrame( {"f0": [1, 2, Null, 3], "f1": [4, 3, Null, 1]}, dtype=dtype ) yield orig, df - orig = pd.DataFrame( - {"f0": [True, False, pd.NA, True], "f1": [False, True, pd.NA, True]}, - dtype=pd.BooleanDtype(), - ) - df = pd.DataFrame( - {"f0": [True, False, pd.NA, True], "f1": [False, True, pd.NA, True]}, - dtype=pd.ArrowDtype(pa.bool_()), - ) - yield orig, df + # If Null is `False`, then there's no missing value. 
+ for Null in (pd.NA, False): + orig = pd.DataFrame( + {"f0": [True, False, Null, True], "f1": [False, True, Null, True]}, + dtype=pd.BooleanDtype(), + ) + df = pd.DataFrame( + {"f0": [True, False, Null, True], "f1": [False, True, Null, True]}, + dtype=pd.ArrowDtype(pa.bool_()), + ) + yield orig, df def check_inf(rng: RNG) -> None: diff --git a/src/collective/tracker.cc b/src/collective/tracker.cc index 56ec5d546b72..6cb3601db7f4 100644 --- a/src/collective/tracker.cc +++ b/src/collective/tracker.cc @@ -111,12 +111,14 @@ RabitTracker::WorkerProxy::WorkerProxy(std::int32_t world, TCPSocket sock, SockA } RabitTracker::RabitTracker(Json const& config) : Tracker{config} { - std::string self; auto rc = Success() << [&] { - return collective::GetHostAddress(&self); + host_.clear(); + host_ = OptionalArg(config, "host", std::string{}); + if (host_.empty()) { + return collective::GetHostAddress(&host_); + } + return Success(); } << [&] { - host_ = OptionalArg(config, "host", self); - auto addr = MakeSockAddress(xgboost::StringView{host_}, 0); listener_ = TCPSocket::Create(addr.IsV4() ? SockDomain::kV4 : SockDomain::kV6); return listener_.Bind(host_, &this->port_); diff --git a/src/common/cuda_pinned_allocator.h b/src/common/cuda_pinned_allocator.h index d11851d99d37..6fe1757fd369 100644 --- a/src/common/cuda_pinned_allocator.h +++ b/src/common/cuda_pinned_allocator.h @@ -61,6 +61,8 @@ class pinned_allocator { XGBOOST_DEVICE inline ~pinned_allocator() {} // NOLINT: host/device markup ignored on defaulted functions XGBOOST_DEVICE inline pinned_allocator(pinned_allocator const&) {} // NOLINT: host/device markup ignored on defaulted functions + pinned_allocator& operator=(pinned_allocator const& that) = default; + pinned_allocator& operator=(pinned_allocator&& that) = default; template XGBOOST_DEVICE inline pinned_allocator(pinned_allocator const&) {} // NOLINT diff --git a/src/common/hist_util.cc b/src/common/hist_util.cc index 9b703a3fa13a..dfd80cb68c13 100644 --- a/src/common/hist_util.cc +++ b/src/common/hist_util.cc @@ -187,21 +187,20 @@ class GHistBuildingManager { }; template -void RowsWiseBuildHistKernel(Span gpair, - const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat, - GHistRow hist) { +void RowsWiseBuildHistKernel(Span gpair, Span row_indices, + const GHistIndexMatrix &gmat, GHistRow hist) { constexpr bool kAnyMissing = BuildingManager::kAnyMissing; constexpr bool kFirstPage = BuildingManager::kFirstPage; using BinIdxType = typename BuildingManager::BinIdxType; - const size_t size = row_indices.Size(); - const size_t *rid = row_indices.begin; + const size_t size = row_indices.size(); + bst_idx_t const *rid = row_indices.data(); auto const *p_gpair = reinterpret_cast(gpair.data()); const BinIdxType *gradient_index = gmat.index.data(); auto const &row_ptr = gmat.row_ptr.data(); auto base_rowid = gmat.base_rowid; - uint32_t const *offsets = gmat.index.Offset(); + std::uint32_t const *offsets = gmat.index.Offset(); // There's no feature-based compression if missing value is present. if (kAnyMissing) { CHECK(!offsets); @@ -212,10 +211,13 @@ void RowsWiseBuildHistKernel(Span gpair, auto get_row_ptr = [&](bst_idx_t ridx) { return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid]; }; - auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); }; + auto get_rid = [&](bst_idx_t ridx) { + return kFirstPage ? 
ridx : (ridx - base_rowid); + }; + CHECK_NE(row_indices.size(), 0); const size_t n_features = - get_row_ptr(row_indices.begin[0] + 1) - get_row_ptr(row_indices.begin[0]); + get_row_ptr(row_indices.data()[0] + 1) - get_row_ptr(row_indices.data()[0]); auto hist_data = reinterpret_cast(hist.data()); const uint32_t two{2}; // Each element from 'gpair' and 'hist' contains // 2 FP values: gradient and hessian. @@ -261,14 +263,13 @@ void RowsWiseBuildHistKernel(Span gpair, } template -void ColsWiseBuildHistKernel(Span gpair, - const RowSetCollection::Elem row_indices, const GHistIndexMatrix &gmat, - GHistRow hist) { +void ColsWiseBuildHistKernel(Span gpair, Span row_indices, + const GHistIndexMatrix &gmat, GHistRow hist) { constexpr bool kAnyMissing = BuildingManager::kAnyMissing; constexpr bool kFirstPage = BuildingManager::kFirstPage; using BinIdxType = typename BuildingManager::BinIdxType; - const size_t size = row_indices.Size(); - const size_t *rid = row_indices.begin; + const size_t size = row_indices.size(); + bst_idx_t const *rid = row_indices.data(); auto const *pgh = reinterpret_cast(gpair.data()); const BinIdxType *gradient_index = gmat.index.data(); @@ -312,35 +313,39 @@ void ColsWiseBuildHistKernel(Span gpair, } template -void BuildHistDispatch(Span gpair, const RowSetCollection::Elem row_indices, +void BuildHistDispatch(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist) { if (BuildingManager::kReadByColumn) { ColsWiseBuildHistKernel(gpair, row_indices, gmat, hist); } else { - const size_t nrows = row_indices.Size(); + const size_t nrows = row_indices.size(); const size_t no_prefetch_size = Prefetch::NoPrefetchSize(nrows); // if need to work with all rows from bin-matrix (e.g. root node) const bool contiguousBlock = - (row_indices.begin[nrows - 1] - row_indices.begin[0]) == (nrows - 1); + (row_indices.begin()[nrows - 1] - row_indices.begin()[0]) == (nrows - 1); if (contiguousBlock) { + if (row_indices.empty()) { + return; + } // contiguous memory access, built-in HW prefetching is enough RowsWiseBuildHistKernel(gpair, row_indices, gmat, hist); } else { - const RowSetCollection::Elem span1(row_indices.begin, - row_indices.end - no_prefetch_size); - const RowSetCollection::Elem span2(row_indices.end - no_prefetch_size, - row_indices.end); - - RowsWiseBuildHistKernel(gpair, span1, gmat, hist); + auto span1 = row_indices.subspan(0, row_indices.size() - no_prefetch_size); + if (!span1.empty()) { + RowsWiseBuildHistKernel(gpair, span1, gmat, hist); + } // no prefetching to avoid loading extra memory - RowsWiseBuildHistKernel(gpair, span2, gmat, hist); + auto span2 = row_indices.subspan(row_indices.size() - no_prefetch_size); + if (!span2.empty()) { + RowsWiseBuildHistKernel(gpair, span2, gmat, hist); + } } } } template -void BuildHist(Span gpair, const RowSetCollection::Elem row_indices, +void BuildHist(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column) { /* force_read_by_column is used for testing the columnwise building of histograms. 
* default force_read_by_column = false @@ -358,13 +363,11 @@ void BuildHist(Span gpair, const RowSetCollection::Elem row_ }); } -template void BuildHist(Span gpair, - const RowSetCollection::Elem row_indices, +template void BuildHist(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column); -template void BuildHist(Span gpair, - const RowSetCollection::Elem row_indices, +template void BuildHist(Span gpair, Span row_indices, const GHistIndexMatrix &gmat, GHistRow hist, bool force_read_by_column); } // namespace xgboost::common diff --git a/src/common/hist_util.h b/src/common/hist_util.h index 2e24f68ffb0a..559093bb5a3f 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -635,7 +635,7 @@ class ParallelGHistBuilder { // construct a histogram via histogram aggregation template -void BuildHist(Span gpair, const RowSetCollection::Elem row_indices, +void BuildHist(Span gpair, Span row_indices, const GHistIndexMatrix& gmat, GHistRow hist, bool force_read_by_column = false); } // namespace common } // namespace xgboost diff --git a/src/common/partition_builder.h b/src/common/partition_builder.h index 53e11bd91d1a..98c876e849a0 100644 --- a/src/common/partition_builder.h +++ b/src/common/partition_builder.h @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023 by Contributors + * Copyright 2021-2024, XGBoost Contributors * \file row_set.h * \brief Quick Utility to compute subset of rows * \author Philip Cho, Tianqi Chen @@ -16,7 +16,6 @@ #include #include -#include "../tree/hist/expand_entry.h" #include "categorical.h" #include "column_matrix.h" #include "xgboost/context.h" @@ -54,23 +53,23 @@ class PartitionBuilder { // Handle dense columns // Analog of std::stable_partition, but in no-inplace manner template - inline std::pair PartitionKernel(ColumnType* p_column, - common::Span row_indices, - common::Span left_part, - common::Span right_part, - size_t base_rowid, Predicate&& pred) { + std::pair PartitionKernel(ColumnType* p_column, + common::Span row_indices, + common::Span left_part, + common::Span right_part, + bst_idx_t base_rowid, Predicate&& pred) { auto& column = *p_column; - size_t* p_left_part = left_part.data(); - size_t* p_right_part = right_part.data(); - size_t nleft_elems = 0; - size_t nright_elems = 0; + bst_idx_t* p_left_part = left_part.data(); + bst_idx_t* p_right_part = right_part.data(); + bst_idx_t nleft_elems = 0; + bst_idx_t nright_elems = 0; auto p_row_indices = row_indices.data(); auto n_samples = row_indices.size(); for (size_t i = 0; i < n_samples; ++i) { auto rid = p_row_indices[i]; - const int32_t bin_id = column[rid - base_rowid]; + bst_bin_t const bin_id = column[rid - base_rowid]; if (any_missing && bin_id == ColumnType::kMissingId) { if (default_left) { p_left_part[nleft_elems++] = rid; @@ -90,14 +89,14 @@ class PartitionBuilder { } template - inline std::pair PartitionRangeKernel(common::Span ridx, - common::Span left_part, - common::Span right_part, + inline std::pair PartitionRangeKernel(common::Span ridx, + common::Span left_part, + common::Span right_part, Pred pred) { - size_t* p_left_part = left_part.data(); - size_t* p_right_part = right_part.data(); - size_t nleft_elems = 0; - size_t nright_elems = 0; + bst_idx_t* p_left_part = left_part.data(); + bst_idx_t* p_right_part = right_part.data(); + bst_idx_t nleft_elems = 0; + bst_idx_t nright_elems = 0; for (auto row_id : ridx) { if (pred(row_id)) { p_left_part[nleft_elems++] = row_id; @@ -112,10 +111,10 @@ class PartitionBuilder { void Partition(const size_t 
node_in_set, std::vector const& nodes, const common::Range1d range, const bst_bin_t split_cond, GHistIndexMatrix const& gmat, const common::ColumnMatrix& column_matrix, - const RegTree& tree, const size_t* rid) { - common::Span rid_span(rid + range.begin(), rid + range.end()); - common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); - common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); + const RegTree& tree, bst_idx_t const* rid) { + common::Span rid_span{rid + range.begin(), rid + range.end()}; + common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); + common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); std::size_t nid = nodes[node_in_set].nid; bst_feature_t fid = tree.SplitIndex(nid); bool default_left = tree.DefaultLeft(nid); @@ -184,8 +183,9 @@ class PartitionBuilder { } template - void MaskKernel(ColumnType* p_column, common::Span row_indices, size_t base_rowid, - BitVector* decision_bits, BitVector* missing_bits, Predicate&& pred) { + void MaskKernel(ColumnType* p_column, common::Span row_indices, + bst_idx_t base_rowid, BitVector* decision_bits, BitVector* missing_bits, + Predicate&& pred) { auto& column = *p_column; for (auto const row_id : row_indices) { auto const bin_id = column[row_id - base_rowid]; @@ -205,9 +205,9 @@ class PartitionBuilder { template void MaskRows(const size_t node_in_set, std::vector const& nodes, const common::Range1d range, bst_bin_t split_cond, GHistIndexMatrix const& gmat, - const common::ColumnMatrix& column_matrix, const RegTree& tree, const size_t* rid, - BitVector* decision_bits, BitVector* missing_bits) { - common::Span rid_span(rid + range.begin(), rid + range.end()); + const common::ColumnMatrix& column_matrix, const RegTree& tree, + bst_idx_t const* rid, BitVector* decision_bits, BitVector* missing_bits) { + common::Span rid_span{rid + range.begin(), rid + range.end()}; std::size_t nid = nodes[node_in_set].nid; bst_feature_t fid = tree.SplitIndex(nid); bool is_cat = tree.GetSplitTypes()[nid] == FeatureType::kCategorical; @@ -263,11 +263,11 @@ class PartitionBuilder { template void PartitionByMask(const size_t node_in_set, std::vector const& nodes, const common::Range1d range, GHistIndexMatrix const& gmat, - const RegTree& tree, const size_t* rid, BitVector const& decision_bits, + const RegTree& tree, bst_idx_t const* rid, BitVector const& decision_bits, BitVector const& missing_bits) { - common::Span rid_span(rid + range.begin(), rid + range.end()); - common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); - common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); + common::Span rid_span(rid + range.begin(), rid + range.end()); + common::Span left = GetLeftBuffer(node_in_set, range.begin(), range.end()); + common::Span right = GetRightBuffer(node_in_set, range.begin(), range.end()); std::size_t nid = nodes[node_in_set].nid; bool default_left = tree.DefaultLeft(nid); @@ -299,12 +299,12 @@ class PartitionBuilder { } } - common::Span GetLeftBuffer(int nid, size_t begin, size_t end) { + common::Span GetLeftBuffer(int nid, size_t begin, size_t end) { const size_t task_idx = GetTaskIdx(nid, begin); return { mem_blocks_.at(task_idx)->Left(), end - begin }; } - common::Span GetRightBuffer(int nid, size_t begin, size_t end) { + common::Span GetRightBuffer(int nid, size_t begin, size_t end) { const size_t task_idx = GetTaskIdx(nid, begin); return { mem_blocks_.at(task_idx)->Right(), end - begin }; } @@ -346,14 +346,14 @@ 
class PartitionBuilder { } } - void MergeToArray(int nid, size_t begin, size_t* rows_indexes) { + void MergeToArray(bst_node_t nid, size_t begin, bst_idx_t* rows_indexes) { size_t task_idx = GetTaskIdx(nid, begin); - size_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left; - size_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right; + bst_idx_t* left_result = rows_indexes + mem_blocks_[task_idx]->n_offset_left; + bst_idx_t* right_result = rows_indexes + mem_blocks_[task_idx]->n_offset_right; - const size_t* left = mem_blocks_[task_idx]->Left(); - const size_t* right = mem_blocks_[task_idx]->Right(); + bst_idx_t const* left = mem_blocks_[task_idx]->Left(); + bst_idx_t const* right = mem_blocks_[task_idx]->Right(); std::copy_n(left, mem_blocks_[task_idx]->n_left, left_result); std::copy_n(right, mem_blocks_[task_idx]->n_right, right_result); @@ -377,10 +377,10 @@ class PartitionBuilder { return; } CHECK(tree.IsLeaf(node.node_id)); - if (node.begin) { // guard for empty node. - size_t ptr_offset = node.end - p_begin; + if (node.begin()) { // guard for empty node. + size_t ptr_offset = node.end() - p_begin; CHECK_LE(ptr_offset, row_set.Data()->size()) << node.node_id; - for (auto idx = node.begin; idx != node.end; ++idx) { + for (auto idx = node.begin(); idx != node.end(); ++idx) { h_pos[*idx] = sampledp(*idx) ? ~node.node_id : node.node_id; } } @@ -395,16 +395,16 @@ class PartitionBuilder { size_t n_offset_left; size_t n_offset_right; - size_t* Left() { + bst_idx_t* Left() { return &left_data_[0]; } - size_t* Right() { + bst_idx_t* Right() { return &right_data_[0]; } private: - size_t left_data_[BlockSize]; - size_t right_data_[BlockSize]; + bst_idx_t left_data_[BlockSize]; + bst_idx_t right_data_[BlockSize]; }; std::vector> left_right_nodes_sizes_; std::vector blocks_offsets_; diff --git a/src/common/random.h b/src/common/random.h index 6d7a1bb499c9..3aed3384a1f6 100644 --- a/src/common/random.h +++ b/src/common/random.h @@ -179,14 +179,14 @@ class ColumnSampler { feature_set_tree_->SetDevice(ctx->Device()); feature_set_tree_->Resize(num_col); - if (ctx->IsCPU()) { - std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0); - } else { + if (ctx->IsCUDA()) { #if defined(XGBOOST_USE_CUDA) cuda_impl::InitFeatureSet(ctx, feature_set_tree_); #else AssertGPUSupport(); #endif + } else { + std::iota(feature_set_tree_->HostVector().begin(), feature_set_tree_->HostVector().end(), 0); } feature_set_tree_ = ColSample(feature_set_tree_, colsample_bytree_); diff --git a/src/common/row_set.h b/src/common/row_set.h index acb39730539b..8df2a7a36839 100644 --- a/src/common/row_set.h +++ b/src/common/row_set.h @@ -31,15 +31,29 @@ class RowSetCollection { * associated with a particular node in a decision tree. */ struct Elem { - std::size_t const* begin{nullptr}; - std::size_t const* end{nullptr}; + private: + bst_idx_t* begin_{nullptr}; + bst_idx_t* end_{nullptr}; + + public: bst_node_t node_id{-1}; // id of node associated with this instance set; -1 means uninitialized Elem() = default; - Elem(std::size_t const* begin, std::size_t const* end, bst_node_t node_id = -1) - : begin(begin), end(end), node_id(node_id) {} + Elem(bst_idx_t* begin, bst_idx_t* end, bst_node_t node_id = -1) + : begin_(begin), end_(end), node_id(node_id) {} + + // Disable copy ctor to avoid casting away the constness via copy. 
+ Elem(Elem const& that) = delete; + Elem& operator=(Elem const& that) = delete; + Elem(Elem&& that) = default; + Elem& operator=(Elem&& that) = default; - std::size_t Size() const { return end - begin; } + [[nodiscard]] std::size_t Size() const { return std::distance(begin(), end()); } + + [[nodiscard]] bst_idx_t const* begin() const { return this->begin_; } // NOLINT + [[nodiscard]] bst_idx_t const* end() const { return this->end_; } // NOLINT + [[nodiscard]] bst_idx_t* begin() { return this->begin_; } // NOLINT + [[nodiscard]] bst_idx_t* end() { return this->end_; } // NOLINT }; [[nodiscard]] std::vector::const_iterator begin() const { // NOLINT @@ -71,55 +85,57 @@ class RowSetCollection { CHECK(elem_of_each_node_.empty()); if (row_indices_.empty()) { // edge case: empty instance set - constexpr std::size_t* kBegin = nullptr; - constexpr std::size_t* kEnd = nullptr; + constexpr bst_idx_t* kBegin = nullptr; + constexpr bst_idx_t* kEnd = nullptr; static_assert(kEnd - kBegin == 0); elem_of_each_node_.emplace_back(kBegin, kEnd, 0); return; } - const std::size_t* begin = dmlc::BeginPtr(row_indices_); - const std::size_t* end = dmlc::BeginPtr(row_indices_) + row_indices_.size(); + bst_idx_t* begin = row_indices_.data(); + bst_idx_t* end = row_indices_.data() + row_indices_.size(); elem_of_each_node_.emplace_back(begin, end, 0); } - [[nodiscard]] std::vector* Data() { return &row_indices_; } - [[nodiscard]] std::vector const* Data() const { return &row_indices_; } + [[nodiscard]] std::vector* Data() { return &row_indices_; } + [[nodiscard]] std::vector const* Data() const { return &row_indices_; } // split rowset into two void AddSplit(bst_node_t node_id, bst_node_t left_node_id, bst_node_t right_node_id, bst_idx_t n_left, bst_idx_t n_right) { - const Elem e = elem_of_each_node_[node_id]; + Elem& e = elem_of_each_node_[node_id]; - std::size_t* all_begin{nullptr}; - std::size_t* begin{nullptr}; - if (e.begin == nullptr) { + bst_idx_t* all_begin{nullptr}; + bst_idx_t* begin{nullptr}; + bst_idx_t* end{nullptr}; + if (e.begin() == nullptr) { CHECK_EQ(n_left, 0); CHECK_EQ(n_right, 0); } else { all_begin = row_indices_.data(); - begin = all_begin + (e.begin - all_begin); + begin = all_begin + (e.begin() - all_begin); + end = elem_of_each_node_[node_id].end(); } CHECK_EQ(n_left + n_right, e.Size()); - CHECK_LE(begin + n_left, e.end); - CHECK_EQ(begin + n_left + n_right, e.end); + CHECK_LE(begin + n_left, e.end()); + CHECK_EQ(begin + n_left + n_right, e.end()); if (left_node_id >= static_cast(elem_of_each_node_.size())) { - elem_of_each_node_.resize(left_node_id + 1, Elem{nullptr, nullptr, -1}); + elem_of_each_node_.resize(left_node_id + 1); } if (right_node_id >= static_cast(elem_of_each_node_.size())) { - elem_of_each_node_.resize(right_node_id + 1, Elem{nullptr, nullptr, -1}); + elem_of_each_node_.resize(right_node_id + 1); } elem_of_each_node_[left_node_id] = Elem{begin, begin + n_left, left_node_id}; - elem_of_each_node_[right_node_id] = Elem{begin + n_left, e.end, right_node_id}; + elem_of_each_node_[right_node_id] = Elem{begin + n_left, end, right_node_id}; elem_of_each_node_[node_id] = Elem{nullptr, nullptr, -1}; } private: // stores the row indexes in the set - std::vector row_indices_; + std::vector row_indices_; // vector: node_id -> elements std::vector elem_of_each_node_; }; diff --git a/src/common/stats.cc b/src/common/stats.cc index bbf969fcc4d8..72c917bedee1 100644 --- a/src/common/stats.cc +++ b/src/common/stats.cc @@ -18,7 +18,7 @@ namespace xgboost::common { void Median(Context 
const* ctx, linalg::Tensor const& t, HostDeviceVector const& weights, linalg::Tensor* out) { - if (!ctx->IsCPU()) { + if (ctx->IsCUDA()) { weights.SetDevice(ctx->Device()); auto opt_weights = OptionalWeights(weights.ConstDeviceSpan()); auto t_v = t.View(ctx->Device()); diff --git a/src/common/threadpool.h b/src/common/threadpool.h index 95d1deaaabc3..21e27aa760a1 100644 --- a/src/common/threadpool.h +++ b/src/common/threadpool.h @@ -26,20 +26,25 @@ class ThreadPool { bool stop_{false}; public: - explicit ThreadPool(std::int32_t n_threads) { + /** + * @param n_threads The number of threads this pool should hold. + * @param init_fn Function called once during thread creation. + */ + template + explicit ThreadPool(std::int32_t n_threads, InitFn&& init_fn) { for (std::int32_t i = 0; i < n_threads; ++i) { - pool_.emplace_back([&] { + pool_.emplace_back([&, init_fn = std::forward(init_fn)] { + init_fn(); + while (true) { std::unique_lock lock{mu_}; cv_.wait(lock, [this] { return !this->tasks_.empty() || stop_; }); if (this->stop_) { - if (!tasks_.empty()) { - while (!tasks_.empty()) { - auto fn = tasks_.front(); - tasks_.pop(); - fn(); - } + while (!tasks_.empty()) { + auto fn = tasks_.front(); + tasks_.pop(); + fn(); } return; } @@ -81,8 +86,13 @@ class ThreadPool { // Use shared ptr to make the task copy constructible. auto p{std::make_shared>()}; auto fut = p->get_future(); - auto ffn = std::function{[task = std::move(p), fn = std::move(fn)]() mutable { - task->set_value(fn()); + auto ffn = std::function{[task = std::move(p), fn = std::forward(fn)]() mutable { + if constexpr (std::is_void_v) { + fn(); + task->set_value(); + } else { + task->set_value(fn()); + } }}; std::unique_lock lock{mu_}; diff --git a/src/data/ellpack_page.cuh b/src/data/ellpack_page.cuh index 04960458f277..d1f9472df4c4 100644 --- a/src/data/ellpack_page.cuh +++ b/src/data/ellpack_page.cuh @@ -45,17 +45,17 @@ struct EllpackDeviceAccessor { n_rows(n_rows), gidx_iter(gidx_iter), feature_types{feature_types} { - if (device.IsCPU()) { - gidx_fvalue_map = cuts->cut_values_.ConstHostSpan(); - feature_segments = cuts->cut_ptrs_.ConstHostSpan(); - min_fvalue = cuts->min_vals_.ConstHostSpan(); - } else { + if (device.IsCUDA()) { cuts->cut_values_.SetDevice(device); cuts->cut_ptrs_.SetDevice(device); cuts->min_vals_.SetDevice(device); gidx_fvalue_map = cuts->cut_values_.ConstDeviceSpan(); feature_segments = cuts->cut_ptrs_.ConstDeviceSpan(); min_fvalue = cuts->min_vals_.ConstDeviceSpan(); + } else { + gidx_fvalue_map = cuts->cut_values_.ConstHostSpan(); + feature_segments = cuts->cut_ptrs_.ConstHostSpan(); + min_fvalue = cuts->min_vals_.ConstHostSpan(); } } // Get a matrix element, uses binary search for look up Return NaN if missing diff --git a/src/data/gradient_index_page_source.cc b/src/data/gradient_index_page_source.cc index f1ceb282a109..0fee1c9fb4b0 100644 --- a/src/data/gradient_index_page_source.cc +++ b/src/data/gradient_index_page_source.cc @@ -9,6 +9,9 @@ void GradientIndexPageSource::Fetch() { if (count_ != 0 && !sync_) { // source is initialized to be the 0th page during construction, so when count_ is 0 // there's no need to increment the source. + // + // The mixin doesn't sync the source if `sync_` is false, we need to sync it + // ourselves. ++(*source_); } // This is not read from cache so we still need it to be synced with sparse page source. 
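A minimal, self-contained sketch of the two ideas behind the ThreadPool change above: a per-thread initialization callback that runs once when each worker starts (this patch uses it to copy the thread-local global configuration into the workers), and a Submit that also accepts tasks returning void by branching on std::is_void_v. This is a simplified illustration rather than the XGBoost implementation; MiniThreadPool and all of its details are hypothetical.

#include <condition_variable>
#include <functional>
#include <future>
#include <mutex>
#include <queue>
#include <thread>
#include <type_traits>
#include <vector>

class MiniThreadPool {
  std::vector<std::thread> workers_;
  std::queue<std::function<void()>> tasks_;
  std::mutex mu_;
  std::condition_variable cv_;
  bool stop_{false};

 public:
  template <typename InitFn>
  MiniThreadPool(int n_threads, InitFn&& init_fn) {
    for (int i = 0; i < n_threads; ++i) {
      workers_.emplace_back([this, init_fn] {
        init_fn();  // runs once per worker, e.g. to copy thread-local configuration
        while (true) {
          std::function<void()> task;
          {
            std::unique_lock lock{mu_};
            cv_.wait(lock, [this] { return stop_ || !tasks_.empty(); });
            if (stop_ && tasks_.empty()) {
              return;  // only exit once the queue has been drained
            }
            task = std::move(tasks_.front());
            tasks_.pop();
          }
          task();
        }
      });
    }
  }

  template <typename Fn, typename R = std::invoke_result_t<Fn>>
  std::future<R> Submit(Fn&& fn) {
    // A shared promise lets void and non-void results share one code path.
    auto p = std::make_shared<std::promise<R>>();
    auto fut = p->get_future();
    {
      std::lock_guard lock{mu_};
      tasks_.emplace([p, fn = std::forward<Fn>(fn)]() mutable {
        if constexpr (std::is_void_v<R>) {
          fn();
          p->set_value();  // a void task still has to fulfil its future
        } else {
          p->set_value(fn());
        }
      });
    }
    cv_.notify_one();
    return fut;
  }

  ~MiniThreadPool() {
    {
      std::lock_guard lock{mu_};
      stop_ = true;
    }
    cv_.notify_all();
    for (auto& t : workers_) {
      t.join();
    }
  }
};

int main() {
  thread_local int config = 0;                  // stand-in for a thread-local global config
  MiniThreadPool pool{2, [] { config = 42; }};  // propagate the value into each worker
  auto a = pool.Submit([] { return config; });  // non-void task
  auto b = pool.Submit([] {});                  // void task
  b.get();
  return a.get() == 42 ? 0 : 1;
}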
diff --git a/src/data/iterative_dmatrix.cc b/src/data/iterative_dmatrix.cc index e581e90ca40b..368aeb2ac2fb 100644 --- a/src/data/iterative_dmatrix.cc +++ b/src/data/iterative_dmatrix.cc @@ -41,10 +41,10 @@ IterativeDMatrix::IterativeDMatrix(DataIterHandle iter_handle, DMatrixHandle pro // hardcoded parameter. BatchParam p{max_bin, tree::TrainParam::DftSparseThreshold()}; - if (ctx.IsCPU()) { - this->InitFromCPU(&ctx, p, iter_handle, missing, ref); - } else { + if (ctx.IsCUDA()) { this->InitFromCUDA(&ctx, p, iter_handle, missing, ref); + } else { + this->InitFromCPU(&ctx, p, iter_handle, missing, ref); } this->fmat_ctx_ = ctx; @@ -73,10 +73,10 @@ void GetCutsFromRef(Context const* ctx, std::shared_ptr ref, bst_featur if (ref->PageExists() && ref->PageExists()) { // Both exists - if (ctx->IsCPU()) { - csr(); - } else { + if (ctx->IsCUDA()) { ellpack(); + } else { + csr(); } } else if (ref->PageExists()) { csr(); @@ -84,10 +84,10 @@ void GetCutsFromRef(Context const* ctx, std::shared_ptr ref, bst_featur ellpack(); } else { // None exist - if (ctx->IsCPU()) { - csr(); - } else { + if (ctx->IsCUDA()) { ellpack(); + } else { + csr(); } } CHECK_EQ(ref->Info().num_col_, n_features) @@ -297,9 +297,9 @@ BatchSet IterativeDMatrix::GetGradientIndex(Context const* ctx } if (!ghist_) { - if (ctx->IsCPU()) { + if (!ctx->IsCUDA()) { ghist_ = std::make_shared(ctx, Info(), *ellpack_, param); - } else if (fmat_ctx_.IsCPU()) { + } else if (!fmat_ctx_.IsCUDA()) { ghist_ = std::make_shared(&fmat_ctx_, Info(), *ellpack_, param); } else { // Can happen when QDM is initialized on GPU, but a CPU version is queried by a different QDM diff --git a/src/data/iterative_dmatrix.cu b/src/data/iterative_dmatrix.cu index 868875bf7d4a..2e8da2c7e7ed 100644 --- a/src/data/iterative_dmatrix.cu +++ b/src/data/iterative_dmatrix.cu @@ -46,7 +46,7 @@ void IterativeDMatrix::InitFromCUDA(Context const* ctx, BatchParam const& p, int32_t current_device; dh::safe_cuda(cudaGetDevice(¤t_device)); auto get_device = [&]() { - auto d = (ctx->IsCPU()) ? DeviceOrd::CUDA(current_device) : ctx->Device(); + auto d = (ctx->IsCUDA()) ? 
ctx->Device() : DeviceOrd::CUDA(current_device); CHECK(!d.IsCPU()); return d; }; diff --git a/src/data/proxy_dmatrix.cc b/src/data/proxy_dmatrix.cc index a28448e3b045..bcefb4999c72 100644 --- a/src/data/proxy_dmatrix.cc +++ b/src/data/proxy_dmatrix.cc @@ -56,7 +56,9 @@ std::shared_ptr CreateDMatrixFromProxy(Context const *ctx, float missing) { bool type_error{false}; std::shared_ptr p_fmat{nullptr}; - if (proxy->Ctx()->IsCPU()) { + if (proxy->Ctx()->IsCUDA()) { + p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing); + } else { p_fmat = data::HostAdapterDispatch( proxy.get(), [&](auto const &adapter) { @@ -65,8 +67,6 @@ std::shared_ptr CreateDMatrixFromProxy(Context const *ctx, return p_fmat; }, &type_error); - } else { - p_fmat = cuda_impl::CreateDMatrixFromProxy(ctx, proxy, missing); } CHECK(p_fmat) << "Failed to fallback."; diff --git a/src/data/proxy_dmatrix.cu b/src/data/proxy_dmatrix.cu index cd76e49cf205..fb484f5e31d1 100644 --- a/src/data/proxy_dmatrix.cu +++ b/src/data/proxy_dmatrix.cu @@ -11,7 +11,7 @@ void DMatrixProxy::FromCudaColumnar(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->Device().IsCPU()) { + if (!adapter->Device().IsCUDA()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); @@ -25,7 +25,7 @@ void DMatrixProxy::FromCudaArray(StringView interface_str) { this->batch_ = adapter; this->Info().num_col_ = adapter->NumColumns(); this->Info().num_row_ = adapter->NumRows(); - if (adapter->Device().IsCPU()) { + if (!adapter->Device().IsCUDA()) { // empty data CHECK_EQ(this->Info().num_row_, 0); ctx_ = ctx_.MakeCUDA(dh::CurrentDevice()); diff --git a/src/data/simple_dmatrix.cc b/src/data/simple_dmatrix.cc index f54d1c43eda4..e4b82b7de59f 100644 --- a/src/data/simple_dmatrix.cc +++ b/src/data/simple_dmatrix.cc @@ -185,12 +185,12 @@ BatchSet SimpleDMatrix::GetGradientIndex(Context const* ctx, CHECK_GE(param.max_bin, 2); // Used only by approx. auto sorted_sketch = param.regen; - if (ctx->IsCPU()) { + if (!ctx->IsCUDA()) { // The context passed in is on CPU, we pick it first since we prioritize the context // in Booster. gradient_index_.reset(new GHistIndexMatrix{ctx, this, param.max_bin, param.sparse_thresh, sorted_sketch, param.hess}); - } else if (fmat_ctx_.IsCPU()) { + } else if (!fmat_ctx_.IsCUDA()) { // DMatrix was initialized on CPU, we use the context from initialization. gradient_index_.reset(new GHistIndexMatrix{&fmat_ctx_, this, param.max_bin, param.sparse_thresh, sorted_sketch, param.hess}); diff --git a/src/data/simple_dmatrix.cu b/src/data/simple_dmatrix.cu index e5b4d18f77db..c177784a36a4 100644 --- a/src/data/simple_dmatrix.cu +++ b/src/data/simple_dmatrix.cu @@ -19,7 +19,7 @@ SimpleDMatrix::SimpleDMatrix(AdapterT* adapter, float missing, std::int32_t nthr DataSplitMode data_split_mode) { CHECK(data_split_mode != DataSplitMode::kCol) << "Column-wise data split is currently not supported on the GPU."; - auto device = (adapter->Device().IsCPU() || adapter->NumRows() == 0) + auto device = (!adapter->Device().IsCUDA() || adapter->NumRows() == 0) ? 
DeviceOrd::CUDA(dh::CurrentDevice()) : adapter->Device(); CHECK(device.IsCUDA()); diff --git a/src/data/sparse_page_source.cu b/src/data/sparse_page_source.cu index 40037eedc0f5..99032aeaad7d 100644 --- a/src/data/sparse_page_source.cu +++ b/src/data/sparse_page_source.cu @@ -20,7 +20,7 @@ std::size_t NFeaturesDevice(DMatrixProxy *proxy) { void DevicePush(DMatrixProxy *proxy, float missing, SparsePage *page) { auto device = proxy->Device(); - if (device.IsCPU()) { + if (!device.IsCUDA()) { device = DeviceOrd::CUDA(dh::CurrentDevice()); } CHECK(device.IsCUDA()); diff --git a/src/data/sparse_page_source.h b/src/data/sparse_page_source.h index 89aa86ace614..550631b72dc5 100644 --- a/src/data/sparse_page_source.h +++ b/src/data/sparse_page_source.h @@ -42,7 +42,7 @@ struct Cache { std::string name; std::string format; // offset into binary cache file. - std::vector offset; + std::vector offset; Cache(bool w, std::string n, std::string fmt, bool on_host) : written{w}, on_host{on_host}, name{std::move(n)}, format{std::move(fmt)} { @@ -61,7 +61,7 @@ struct Cache { /** * @brief Record a page with size of n_bytes. */ - void Push(std::size_t n_bytes) { offset.push_back(n_bytes); } + void Push(bst_idx_t n_bytes) { offset.push_back(n_bytes); } /** * @brief Returns the view start and length for the i^th page. */ @@ -73,7 +73,7 @@ struct Cache { /** * @brief Get the number of bytes for the i^th page. */ - [[nodiscard]] std::uint64_t Bytes(std::size_t i) const { return offset.at(i + 1) - offset[i]; } + [[nodiscard]] bst_idx_t Bytes(std::size_t i) const { return offset.at(i + 1) - offset[i]; } /** * @brief Call this once the write for the cache is complete. */ @@ -218,7 +218,6 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol common::Monitor monitor_; [[nodiscard]] bool ReadCache() { - CHECK(!at_end_); if (!cache_info_->written) { return false; } @@ -237,7 +236,6 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol exce_.Rethrow(); - auto const config = *GlobalConfigThreadLocalStore::Get(); for (std::int32_t i = 0; i < n_prefetch_batches; ++i, ++fetch_it) { fetch_it %= n_batches_; // ring if (ring_->at(fetch_it).valid()) { @@ -245,8 +243,7 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol } auto const* self = this; // make sure it's const CHECK_LT(fetch_it, cache_info_->offset.size()); - ring_->at(fetch_it) = this->workers_.Submit([fetch_it, self, config, this] { - *GlobalConfigThreadLocalStore::Get() = config; + ring_->at(fetch_it) = this->workers_.Submit([fetch_it, self, this] { auto page = std::make_shared(); this->exce_.Run([&] { std::unique_ptr fmt{this->CreatePageFormat()}; @@ -259,11 +256,13 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol return page; }); } + CHECK_EQ(std::count_if(ring_->cbegin(), ring_->cend(), [](auto const& f) { return f.valid(); }), n_prefetch_batches) << "Sparse DMatrix assumes forward iteration."; monitor_.Start("Wait"); + CHECK((*ring_)[count_].valid()); page_ = (*ring_)[count_].get(); CHECK(!(*ring_)[count_].valid()); monitor_.Stop("Wait"); @@ -296,7 +295,10 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol public: SparsePageSourceImpl(float missing, int nthreads, bst_feature_t n_features, bst_idx_t n_batches, std::shared_ptr cache) - : workers_{std::max(2, std::min(nthreads, 16))}, // Don't use too many threads. 
+ : workers_{std::max(2, std::min(nthreads, 16)), + [config = *GlobalConfigThreadLocalStore::Get()] { + *GlobalConfigThreadLocalStore::Get() = config; + }}, missing_{missing}, nthreads_{nthreads}, n_features_{n_features}, @@ -331,12 +333,28 @@ class SparsePageSourceImpl : public BatchIteratorImpl, public FormatStreamPol return at_end_; } + // Call this at the last iteration. + void EndIter() { + CHECK_EQ(this->cache_info_->offset.size(), this->n_batches_ + 1); + this->cache_info_->Commit(); + if (this->n_batches_ != 0) { + CHECK_EQ(this->count_, this->n_batches_); + } + CHECK_GE(this->count_, 1); + this->count_ = 0; + } + virtual void Reset() { TryLockGuard guard{single_threaded_}; - at_end_ = false; - count_ = 0; - // Pre-fetch for the next round of iterations. - this->Fetch(); + + this->at_end_ = false; + auto cnt = this->count_; + this->count_ = 0; + if (cnt != 0) { + // The last iteration did not get to the end, clear the ring to start from 0. + this->ring_ = std::make_unique(); + this->Fetch(); + } } }; @@ -404,16 +422,11 @@ class SparsePageSource : public SparsePageSourceImpl { CHECK_LE(count_, n_batches_); if (at_end_) { - CHECK_EQ(cache_info_->offset.size(), n_batches_ + 1); - cache_info_->Commit(); - if (n_batches_ != 0) { - CHECK_EQ(count_, n_batches_); - } - CHECK_GE(count_, 1); - proxy_ = nullptr; - } else { - this->Fetch(); + this->EndIter(); + this->proxy_ = nullptr; } + + this->Fetch(); return *this; } @@ -446,36 +459,46 @@ class PageSourceIncMixIn : public SparsePageSourceImpl { PageSourceIncMixIn(float missing, std::int32_t nthreads, bst_feature_t n_features, bst_idx_t n_batches, std::shared_ptr cache, bool sync) : Super::SparsePageSourceImpl{missing, nthreads, n_features, n_batches, cache}, sync_{sync} {} - + // This function always operates on the source first, then the downstream. The downstream + // can assume the source to be ready. [[nodiscard]] PageSourceIncMixIn& operator++() final { TryLockGuard guard{this->single_threaded_}; + // Increment the source. if (sync_) { ++(*source_); } - + // Increment self. ++this->count_; + // Set at end. this->at_end_ = this->count_ == this->n_batches_; if (this->at_end_) { - this->cache_info_->Commit(); - if (this->n_batches_ != 0) { - CHECK_EQ(this->count_, this->n_batches_); + // If this is the first round of iterations, we have just built the binary cache + // from source. For a non-sync page type, the source hasn't been updated to the end + // iteration yet due to skipped increment. We increment the source here and it will + // call the `EndIter` method itself. + bool src_need_inc = !sync_ && this->source_->Iter() != 0; + if (src_need_inc) { + CHECK_EQ(this->source_->Iter(), this->count_ - 1); + ++(*source_); + } + this->EndIter(); + + if (src_need_inc) { + CHECK(this->cache_info_->written); } - CHECK_GE(this->count_, 1); - } else { - this->Fetch(); } + this->Fetch(); if (sync_) { + // Sanity check.
CHECK_EQ(source_->Iter(), this->count_); } return *this; } void Reset() final { - if (sync_) { - this->source_->Reset(); - } + this->source_->Reset(); Super::Reset(); } }; diff --git a/src/metric/auc.cc b/src/metric/auc.cc index 6de0d1f129cb..fcb774a4aa70 100644 --- a/src/metric/auc.cc +++ b/src/metric/auc.cc @@ -336,12 +336,12 @@ class EvalROCAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { std::tie(auc, valid_groups) = - RankingAUC(ctx_, predts.ConstHostVector(), info, n_threads); + GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); } else { std::tie(auc, valid_groups) = - GPURankingAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); + RankingAUC(ctx_, predts.ConstHostVector(), info, n_threads); } return std::make_pair(auc, valid_groups); } @@ -351,10 +351,10 @@ class EvalROCAUC : public EvalAUC { double auc{0}; auto n_threads = ctx_->Threads(); CHECK_NE(n_classes, 0); - if (ctx_->IsCPU()) { - auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); - } else { + if (ctx_->IsCUDA()) { auc = GPUMultiClassROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_, n_classes); + } else { + auc = MultiClassOVR(ctx_, predts.ConstHostVector(), info, n_classes, n_threads, BinaryROCAUC); } return auc; } @@ -362,13 +362,13 @@ class EvalROCAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double fp, tp, auc; - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { + std::tie(fp, tp, auc) = + GPUBinaryROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); + } else { std::tie(fp, tp, auc) = BinaryROCAUC(ctx_, predts.ConstHostVector(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); - } else { - std::tie(fp, tp, auc) = - GPUBinaryROCAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); } return std::make_tuple(fp, tp, auc); } @@ -413,23 +413,23 @@ class EvalPRAUC : public EvalAUC { std::tuple EvalBinary(HostDeviceVector const &predts, MetaInfo const &info) { double pr, re, auc; - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { + std::tie(pr, re, auc) = GPUBinaryPRAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); + } else { std::tie(pr, re, auc) = BinaryPRAUC(ctx_, predts.ConstHostSpan(), info.labels.HostView().Slice(linalg::All(), 0), common::OptionalWeights{info.weights_.ConstHostSpan()}); - } else { - std::tie(pr, re, auc) = GPUBinaryPRAUC(ctx_, predts.ConstDeviceSpan(), info, &this->d_cache_); } return std::make_tuple(pr, re, auc); } double EvalMultiClass(HostDeviceVector const &predts, MetaInfo const &info, size_t n_classes) { - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { + return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes); + } else { auto n_threads = this->ctx_->Threads(); return MultiClassOVR(ctx_, predts.ConstHostSpan(), info, n_classes, n_threads, BinaryPRAUC); - } else { - return GPUMultiClassPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_, n_classes); } } @@ -438,16 +438,16 @@ class EvalPRAUC : public EvalAUC { double auc{0}; uint32_t valid_groups = 0; auto n_threads = ctx_->Threads(); - if (ctx_->IsCPU()) { + if (ctx_->IsCUDA()) { + std::tie(auc, valid_groups) = + GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_); + } else { auto labels = info.labels.Data()->ConstHostSpan(); if (std::any_of(labels.cbegin(), labels.cend(), PRAUCLabelInvalid{})) { 
InvalidLabels(); } std::tie(auc, valid_groups) = RankingAUC(ctx_, predts.ConstHostVector(), info, n_threads); - } else { - std::tie(auc, valid_groups) = - GPURankingPRAUC(ctx_, predts.ConstDeviceSpan(), info, &d_cache_); } return std::make_pair(auc, valid_groups); } diff --git a/src/metric/multiclass_metric.cu b/src/metric/multiclass_metric.cu index e51509fc7339..70738fdf04e9 100644 --- a/src/metric/multiclass_metric.cu +++ b/src/metric/multiclass_metric.cu @@ -131,7 +131,7 @@ class MultiClassMetricsReduction { const HostDeviceVector& preds) { PackedReduceResult result; - if (device.IsCPU()) { + if (!device.IsCUDA()) { result = CpuReduceMetrics(weights, labels, preds, n_class, ctx.Threads()); } diff --git a/src/metric/survival_metric.cu b/src/metric/survival_metric.cu index 9c57be3ab2b5..d8ef7eb95b5d 100644 --- a/src/metric/survival_metric.cu +++ b/src/metric/survival_metric.cu @@ -127,7 +127,7 @@ class ElementWiseSurvivalMetricsReduction { const HostDeviceVector& preds) { PackedReduceResult result; - if (ctx.IsCPU()) { + if (!ctx.IsCUDA()) { result = CpuReduceMetrics(weights, labels_lower_bound, labels_upper_bound, preds, ctx.Threads()); } diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h index c3065ad5f135..cd267673b66c 100644 --- a/src/tree/common_row_partitioner.h +++ b/src/tree/common_row_partitioner.h @@ -7,7 +7,7 @@ #define XGBOOST_TREE_COMMON_ROW_PARTITIONER_H_ #include // for all_of, fill -#include // for uint32_t +#include // for uint32_t, int32_t #include // for numeric_limits #include // for vector @@ -18,7 +18,7 @@ #include "../common/partition_builder.h" // for PartitionBuilder #include "../common/row_set.h" // for RowSetCollection #include "../common/threading_utils.h" // for ParallelFor2d -#include "xgboost/base.h" // for bst_row_t +#include "xgboost/base.h" // for bst_idx_t #include "xgboost/collective/result.h" // for Success, SafeColl #include "xgboost/context.h" // for Context #include "xgboost/linalg.h" // for TensorView @@ -46,7 +46,7 @@ class ColumnSplitHelper { void Partition(Context const* ctx, common::BlockedSpace2d const& space, std::int32_t n_threads, GHistIndexMatrix const& gmat, common::ColumnMatrix const& column_matrix, std::vector const& nodes, - std::vector const& split_conditions, RegTree const* p_tree) { + std::vector const& split_conditions, RegTree const* p_tree) { // When data is split by column, we don't have all the feature values in the local worker, so // we first collect all the decisions and whether the feature is missing into bit vectors. std::fill(decision_storage_.begin(), decision_storage_.end(), 0); @@ -56,7 +56,7 @@ class ColumnSplitHelper { bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0; partition_builder_->MaskRows( node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree, - (*row_set_collection_)[nid].begin, &decision_bits_, &missing_bits_); + (*row_set_collection_)[nid].begin(), &decision_bits_, &missing_bits_); }); // Then aggregate the bit vectors across all the workers. 
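The ColumnSplitHelper code above drives the column-wise split with two bit vectors: one recording whether the split feature was missing for a row on every worker, the other recording the split decision where it could be evaluated. Once the vectors have been aggregated across workers, every worker routes its rows identically. The following is a simplified, hypothetical version of that routing step, with std::vector<bool> standing in for BitVector and the collective aggregation assumed to have already happened.

#include <cstdint>
#include <vector>

// Hypothetical, simplified row router: decision[i] is meaningful only when
// missing[i] is false.  In XGBoost the two bit vectors are reduced across
// workers before this step; here we assume that has already happened.
struct SplitMask {
  std::vector<bool> decision;  // true => row goes to the left child
  std::vector<bool> missing;   // true => feature value unavailable on all workers
};

void PartitionByMask(SplitMask const& mask, bool default_left,
                     std::vector<std::uint64_t> const& rows,
                     std::vector<std::uint64_t>* left,
                     std::vector<std::uint64_t>* right) {
  for (auto ridx : rows) {
    // Rows with a globally missing feature follow the node's default direction.
    bool go_left = mask.missing[ridx] ? default_left : mask.decision[ridx];
    (go_left ? left : right)->push_back(ridx);
  }
}

int main() {
  SplitMask mask{{true, false, false, true}, {false, false, true, false}};
  std::vector<std::uint64_t> rows{0, 1, 2, 3}, left, right;
  PartitionByMask(mask, /*default_left=*/true, rows, &left, &right);
  // Rows 0 and 3 go left by decision, row 2 goes left by default, row 1 goes right.
  return (left.size() == 3 && right.size() == 1) ? 0 : 1;
}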
@@ -74,7 +74,7 @@ class ColumnSplitHelper { const size_t task_id = partition_builder_->GetTaskIdx(node_in_set, begin); partition_builder_->AllocateForTask(task_id); partition_builder_->PartitionByMask(node_in_set, nodes, r, gmat, *p_tree, - (*row_set_collection_)[nid].begin, decision_bits_, + (*row_set_collection_)[nid].begin(), decision_bits_, missing_bits_); }); } @@ -98,10 +98,10 @@ class CommonRowPartitioner { bool is_col_split) : base_rowid{_base_rowid}, is_col_split_{is_col_split} { row_set_collection_.Clear(); - std::vector& row_indices = *row_set_collection_.Data(); + std::vector& row_indices = *row_set_collection_.Data(); row_indices.resize(num_row); - std::size_t* p_row_indices = row_indices.data(); + bst_idx_t* p_row_indices = row_indices.data(); common::Iota(ctx, p_row_indices, p_row_indices + row_indices.size(), base_rowid); row_set_collection_.Init(); @@ -112,7 +112,7 @@ class CommonRowPartitioner { template void FindSplitConditions(const std::vector& nodes, const RegTree& tree, - const GHistIndexMatrix& gmat, std::vector* split_conditions) { + const GHistIndexMatrix& gmat, std::vector* split_conditions) { auto const& ptrs = gmat.cut.Ptrs(); auto const& vals = gmat.cut.Values(); @@ -197,7 +197,7 @@ class CommonRowPartitioner { // 1. Find split condition for each split size_t n_nodes = nodes.size(); - std::vector split_conditions; + std::vector split_conditions; if (column_matrix.IsInitialized()) { split_conditions.resize(n_nodes); FindSplitConditions(nodes, *p_tree, gmat, &split_conditions); @@ -206,8 +206,8 @@ class CommonRowPartitioner { // 2.1 Create a blocked space of size SUM(samples in each node) common::BlockedSpace2d space( n_nodes, - [&](size_t node_in_set) { - int32_t nid = nodes[node_in_set].nid; + [&](std::size_t node_in_set) { + auto nid = nodes[node_in_set].nid; return row_set_collection_[nid].Size(); }, kPartitionBlockSize); @@ -236,7 +236,7 @@ class CommonRowPartitioner { bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0; partition_builder_.template Partition( node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree, - row_set_collection_[nid].begin); + row_set_collection_[nid].begin()); }); } @@ -248,8 +248,7 @@ class CommonRowPartitioner { // with updated row-indexes for each tree-node common::ParallelFor2d(space, ctx->Threads(), [&](size_t node_in_set, common::Range1d r) { const int32_t nid = nodes[node_in_set].nid; - partition_builder_.MergeToArray(node_in_set, r.begin(), - const_cast(row_set_collection_[nid].begin)); + partition_builder_.MergeToArray(node_in_set, r.begin(), row_set_collection_[nid].begin()); }); // 5. Add info about splits into row_set_collection_ diff --git a/src/tree/gpu_hist/feature_groups.cu b/src/tree/gpu_hist/feature_groups.cu index 27ed9bd919c8..52e58da7efbb 100644 --- a/src/tree/gpu_hist/feature_groups.cu +++ b/src/tree/gpu_hist/feature_groups.cu @@ -1,5 +1,5 @@ -/*! - * Copyright 2020 by XGBoost Contributors +/** + * Copyright 2020-2024, XGBoost Contributors */ #include @@ -8,12 +8,9 @@ #include "feature_groups.cuh" -#include "../../common/device_helpers.cuh" #include "../../common/hist_util.h" -namespace xgboost { -namespace tree { - +namespace xgboost::tree { FeatureGroups::FeatureGroups(const common::HistogramCuts& cuts, bool is_dense, size_t shm_size, size_t bin_size) { // Only use a single feature group for sparse matrices. 
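FindSplitConditions, whose signature above now uses std::vector<bst_bin_t>, maps each node's floating-point split value to the histogram bin index that the partitioner later compares row bins against. The sketch below illustrates the idea under simplifying assumptions: cut values are stored flat with per-feature offsets, and the split value is itself one of the cut values, as it is for hist-based training. Cuts and SplitValueToBin are hypothetical stand-ins for HistogramCuts and the real lookup.

#include <cstdint>
#include <vector>

using bst_bin_t = std::int32_t;

// Quantile cuts laid out as one flat array, with ptrs[fid]..ptrs[fid + 1]
// delimiting the cut values of feature fid (mirrors the Ptrs/Values layout).
struct Cuts {
  std::vector<std::uint32_t> ptrs;
  std::vector<float> values;
};

// Map a node's floating-point split value to the global bin index the
// partitioner compares against.  Assumes the split value was itself taken
// from the cut values, as hist-based training guarantees.
bst_bin_t SplitValueToBin(Cuts const& cuts, std::uint32_t fid, float split_value) {
  for (auto bin = cuts.ptrs[fid]; bin < cuts.ptrs[fid + 1]; ++bin) {
    if (cuts.values[bin] == split_value) {
      return static_cast<bst_bin_t>(bin);
    }
  }
  return -1;  // not found: the caller treats the condition as invalid
}

int main() {
  // Two features: feature 0 owns bins [0, 3), feature 1 owns bins [3, 5).
  Cuts cuts{{0, 3, 5}, {0.5f, 1.5f, 2.5f, 10.0f, 20.0f}};
  return SplitValueToBin(cuts, 1, 20.0f) == 4 ? 0 : 1;
}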
@@ -59,6 +56,4 @@ void FeatureGroups::InitSingle(const common::HistogramCuts& cuts) { max_group_bins = cuts.TotalBins(); } - -} // namespace tree -} // namespace xgboost +} // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cu b/src/tree/gpu_hist/histogram.cu index 90c151556566..372a5c09ba0c 100644 --- a/src/tree/gpu_hist/histogram.cu +++ b/src/tree/gpu_hist/histogram.cu @@ -1,12 +1,10 @@ /** * Copyright 2020-2024, XGBoost Contributors */ -#include -#include +#include // for make_transform_iterator #include -#include // uint32_t -#include +#include // uint32_t, int32_t #include "../../collective/aggregator.h" #include "../../common/deterministic.cuh" @@ -102,9 +100,8 @@ GradientQuantiser::GradientQuantiser(Context const* ctx, common::Span(1) / to_floating_point_.GetHess()); } -XGBOOST_DEV_INLINE void -AtomicAddGpairShared(xgboost::GradientPairInt64 *dest, - xgboost::GradientPairInt64 const &gpair) { +XGBOOST_DEV_INLINE void AtomicAddGpairShared(xgboost::GradientPairInt64* dest, + xgboost::GradientPairInt64 const& gpair) { auto dst_ptr = reinterpret_cast(dest); auto g = gpair.GetQuantisedGrad(); auto h = gpair.GetQuantisedHess(); @@ -128,11 +125,13 @@ XGBOOST_DEV_INLINE void AtomicAddGpairGlobal(xgboost::GradientPairInt64* dest, } template + int kItemsPerTile = kBlockThreads * kItemsPerThread> class HistogramAgent { GradientPairInt64* smem_arr_; GradientPairInt64* d_node_hist_; - dh::LDGIterator d_ridx_; + using Idx = RowPartitioner::RowIndexT; + + dh::LDGIterator d_ridx_; const GradientPair* d_gpair_; const FeatureGroup group_; const EllpackDeviceAccessor& matrix_; @@ -143,8 +142,7 @@ class HistogramAgent { public: __device__ HistogramAgent(GradientPairInt64* smem_arr, GradientPairInt64* __restrict__ d_node_hist, const FeatureGroup& group, - const EllpackDeviceAccessor& matrix, - common::Span d_ridx, + const EllpackDeviceAccessor& matrix, common::Span d_ridx, const GradientQuantiser& rounding, const GradientPair* d_gpair) : smem_arr_(smem_arr), d_node_hist_(d_node_hist), @@ -155,15 +153,15 @@ class HistogramAgent { n_elements_(feature_stride_ * d_ridx.size()), rounding_(rounding), d_gpair_(d_gpair) {} + __device__ void ProcessPartialTileShared(std::size_t offset) { for (std::size_t idx = offset + threadIdx.x; idx < std::min(offset + kBlockThreads * kItemsPerTile, n_elements_); idx += kBlockThreads) { - int ridx = d_ridx_[idx / feature_stride_]; - int gidx = - matrix_ - .gidx_iter[ridx * matrix_.row_stride + group_.start_feature + idx % feature_stride_] - - group_.start_bin; + Idx ridx = d_ridx_[idx / feature_stride_]; + Idx midx = (ridx - matrix_.base_rowid) * matrix_.row_stride + group_.start_feature + + idx % feature_stride_; + bst_bin_t gidx = matrix_.gidx_iter[midx] - group_.start_bin; if (matrix_.is_dense || gidx != matrix_.NumBins()) { auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); AtomicAddGpairShared(smem_arr_ + gidx, adjusted); @@ -189,8 +187,8 @@ class HistogramAgent { #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { gpair[i] = d_gpair_[ridx[i]]; - gidx[i] = matrix_.gidx_iter[ridx[i] * matrix_.row_stride + group_.start_feature + - idx[i] % feature_stride_]; + gidx[i] = matrix_.gidx_iter[(ridx[i] - matrix_.base_rowid) * matrix_.row_stride + + group_.start_feature + idx[i] % feature_stride_]; } #pragma unroll for (int i = 0; i < kItemsPerThread; i++) { @@ -201,7 +199,7 @@ class HistogramAgent { } } __device__ void BuildHistogramWithShared() { - dh::BlockFill(smem_arr_, group_.num_bins, GradientPairInt64()); + dh::BlockFill(smem_arr_, 
group_.num_bins, GradientPairInt64{}); __syncthreads(); std::size_t offset = blockIdx.x * kItemsPerTile; @@ -220,10 +218,9 @@ class HistogramAgent { __device__ void BuildHistogramWithGlobal() { for (auto idx : dh::GridStrideRange(static_cast(0), n_elements_)) { - int ridx = d_ridx_[idx / feature_stride_]; - int gidx = - matrix_ - .gidx_iter[ridx * matrix_.row_stride + group_.start_feature + idx % feature_stride_]; + Idx ridx = d_ridx_[idx / feature_stride_]; + bst_bin_t gidx = matrix_.gidx_iter[(ridx - matrix_.base_rowid) * matrix_.row_stride + + group_.start_feature + idx % feature_stride_]; if (matrix_.is_dense || gidx != matrix_.NumBins()) { auto adjusted = rounding_.ToFixedPoint(d_gpair_[ridx]); AtomicAddGpairGlobal(d_node_hist_ + gidx, adjusted); @@ -232,8 +229,7 @@ class HistogramAgent { } }; -template +template __global__ void __launch_bounds__(kBlockThreads) SharedMemHistKernel(const EllpackDeviceAccessor matrix, const FeatureGroupsAccessor feature_groups, @@ -244,8 +240,8 @@ __global__ void __launch_bounds__(kBlockThreads) extern __shared__ char smem[]; const FeatureGroup group = feature_groups[blockIdx.y]; auto smem_arr = reinterpret_cast(smem); - auto agent = HistogramAgent( - smem_arr, d_node_hist, group, matrix, d_ridx, rounding, d_gpair); + auto agent = HistogramAgent(smem_arr, d_node_hist, group, matrix, + d_ridx, rounding, d_gpair); if (use_shared_memory_histograms) { agent.BuildHistogramWithShared(); } else { @@ -253,44 +249,74 @@ __global__ void __launch_bounds__(kBlockThreads) } } -void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, - FeatureGroupsAccessor const& feature_groups, - common::Span gpair, - common::Span d_ridx, - common::Span histogram, GradientQuantiser rounding, - bool force_global_memory) { - // decide whether to use shared memory - int device = 0; - dh::safe_cuda(cudaGetDevice(&device)); - // opt into maximum shared memory for the kernel if necessary - size_t max_shared_memory = dh::MaxSharedMemoryOptin(device); - - size_t smem_size = - sizeof(GradientPairInt64) * feature_groups.max_group_bins; - bool shared = !force_global_memory && smem_size <= max_shared_memory; - smem_size = shared ? smem_size : 0; - - constexpr int kBlockThreads = 1024; - constexpr int kItemsPerThread = 8; - constexpr int kItemsPerTile = kBlockThreads * kItemsPerThread; - - auto runit = [&, kMinItemsPerBlock = kItemsPerTile](auto kernel) { - if (shared) { - dh::safe_cuda(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, - max_shared_memory)); - } +namespace { +constexpr std::int32_t kBlockThreads = 1024; +constexpr std::int32_t kItemsPerThread = 8; +constexpr std::int32_t ItemsPerTile() { return kBlockThreads * kItemsPerThread; } +} // namespace + +// Use auto deduction guide to workaround compiler error. 
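Before the kernel-wrapper template that follows, it helps to spell out the base_rowid arithmetic introduced in HistogramAgent above: with external memory, an ELLPACK batch stores only its own slice of rows, so a global row index must be rebased before computing the offset into the compressed gidx buffer. A simplified, hypothetical sketch using a dense layout and plain host-side arrays in place of EllpackDeviceAccessor:

#include <cstddef>
#include <cstdint>
#include <vector>

using bst_idx_t = std::uint64_t;

// Simplified dense ELLPACK batch: `gidx` holds row_stride bin indices per row,
// but only for rows [base_rowid, base_rowid + n_rows) of the full matrix.
struct EllpackBatch {
  bst_idx_t base_rowid;
  std::size_t row_stride;
  std::vector<std::uint32_t> gidx;
};

// Fetch the bin index of (global row ridx, feature fid).  Forgetting to
// subtract base_rowid would index past the end for every batch after the first.
std::uint32_t BinIndex(EllpackBatch const& batch, bst_idx_t ridx, std::size_t fid) {
  auto local_row = ridx - batch.base_rowid;
  return batch.gidx[local_row * batch.row_stride + fid];
}

int main() {
  // Second batch of a 2-feature matrix, covering global rows 30..59.
  EllpackBatch batch{30, 2, std::vector<std::uint32_t>(60, 7u)};
  return BinIndex(batch, /*ridx=*/31, /*fid=*/1) == 7u ? 0 : 1;
}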
+template , + auto Shared = SharedMemHistKernel> +struct HistogramKernel { + decltype(Global) global_kernel{SharedMemHistKernel}; + decltype(Shared) shared_kernel{SharedMemHistKernel}; + bool shared{false}; + std::uint32_t grid_size{0}; + std::size_t smem_size{0}; + + HistogramKernel(Context const* ctx, FeatureGroupsAccessor const& feature_groups, + bool force_global_memory) { + // Decide whether to use shared memory + // Opt into maximum shared memory for the kernel if necessary + std::size_t max_shared_memory = dh::MaxSharedMemoryOptin(ctx->Ordinal()); + + this->smem_size = sizeof(GradientPairInt64) * feature_groups.max_group_bins; + this->shared = !force_global_memory && smem_size <= max_shared_memory; + this->smem_size = this->shared ? this->smem_size : 0; + + auto init = [&](auto& kernel) { + if (this->shared) { + dh::safe_cuda(cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, + max_shared_memory)); + } + + // determine the launch configuration + std::int32_t num_groups = feature_groups.NumGroups(); + std::int32_t n_mps = 0; + dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, ctx->Ordinal())); + + std::int32_t n_blocks_per_mp = 0; + dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, + kBlockThreads, this->smem_size)); + + // This gives the number of blocks to keep the device occupied Use this as the + // maximum number of blocks + this->grid_size = n_blocks_per_mp * n_mps; + }; + + init(this->global_kernel); + init(this->shared_kernel); + } +}; + +class DeviceHistogramBuilderImpl { + std::unique_ptr> kernel_{nullptr}; + bool force_global_memory_{false}; - // determine the launch configuration - int num_groups = feature_groups.NumGroups(); - int n_mps = 0; - dh::safe_cuda(cudaDeviceGetAttribute(&n_mps, cudaDevAttrMultiProcessorCount, device)); - int n_blocks_per_mp = 0; - dh::safe_cuda(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&n_blocks_per_mp, kernel, - kBlockThreads, smem_size)); - // This gives the number of blocks to keep the device occupied - // Use this as the maximum number of blocks - unsigned grid_size = n_blocks_per_mp * n_mps; + public: + void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, + bool force_global_memory) { + this->kernel_ = std::make_unique>(ctx, feature_groups, force_global_memory); + this->force_global_memory_ = force_global_memory; + } + void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, + FeatureGroupsAccessor const& feature_groups, + common::Span gpair, + common::Span d_ridx, + common::Span histogram, GradientQuantiser rounding) { + CHECK(kernel_); // Otherwise launch blocks such that each block has a minimum amount of work to do // There are fixed costs to launching each block, e.g. 
zeroing shared memory // The below amount of minimum work was found by experimentation @@ -300,20 +326,41 @@ void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& // Allocate number of blocks such that each block has about kMinItemsPerBlock work // Up to a maximum where the device is saturated - grid_size = std::min(grid_size, static_cast( - common::DivRoundUp(items_per_group, kMinItemsPerBlock))); + auto constexpr kMinItemsPerBlock = ItemsPerTile(); + auto grid_size = std::min(kernel_->grid_size, static_cast(common::DivRoundUp( + items_per_group, kMinItemsPerBlock))); + + if (this->force_global_memory_ || !this->kernel_->shared) { + dh::LaunchKernel{dim3(grid_size, feature_groups.NumGroups()), // NOLINT + static_cast(kBlockThreads), kernel_->smem_size, + ctx->Stream()}(kernel_->global_kernel, matrix, feature_groups, d_ridx, + histogram.data(), gpair.data(), rounding); + } else { + dh::LaunchKernel{dim3(grid_size, feature_groups.NumGroups()), // NOLINT + static_cast(kBlockThreads), kernel_->smem_size, + ctx->Stream()}(kernel_->shared_kernel, matrix, feature_groups, d_ridx, + histogram.data(), gpair.data(), rounding); + } + } +}; - dh::LaunchKernel {dim3(grid_size, num_groups), static_cast(kBlockThreads), smem_size, - ctx->Stream()} (kernel, matrix, feature_groups, d_ridx, histogram.data(), - gpair.data(), rounding); - }; +DeviceHistogramBuilder::DeviceHistogramBuilder() + : p_impl_{std::make_unique()} {} - if (shared) { - runit(SharedMemHistKernel); - } else { - runit(SharedMemHistKernel); - } +DeviceHistogramBuilder::~DeviceHistogramBuilder() = default; + +void DeviceHistogramBuilder::Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, + bool force_global_memory) { + this->p_impl_->Reset(ctx, feature_groups, force_global_memory); +} - dh::safe_cuda(cudaGetLastError()); +void DeviceHistogramBuilder::BuildHistogram(CUDAContext const* ctx, + EllpackDeviceAccessor const& matrix, + FeatureGroupsAccessor const& feature_groups, + common::Span gpair, + common::Span ridx, + common::Span histogram, + GradientQuantiser rounding) { + this->p_impl_->BuildHistogram(ctx, matrix, feature_groups, gpair, ridx, histogram, rounding); } } // namespace xgboost::tree diff --git a/src/tree/gpu_hist/histogram.cuh b/src/tree/gpu_hist/histogram.cuh index 925c548936f4..862821b00b63 100644 --- a/src/tree/gpu_hist/histogram.cuh +++ b/src/tree/gpu_hist/histogram.cuh @@ -1,17 +1,18 @@ -/*! - * Copyright 2020-2021 by XGBoost Contributors +/** + * Copyright 2020-2024, XGBoost Contributors */ #ifndef HISTOGRAM_CUH_ #define HISTOGRAM_CUH_ -#include - -#include "../../common/cuda_context.cuh" -#include "../../data/ellpack_page.cuh" -#include "feature_groups.cuh" +#include // for unique_ptr -namespace xgboost { -namespace tree { +#include "../../common/cuda_context.cuh" // for CUDAContext +#include "../../data/ellpack_page.cuh" // for EllpackDeviceAccessor +#include "feature_groups.cuh" // for FeatureGroupsAccessor +#include "xgboost/base.h" // for GradientPair, GradientPairInt64 +#include "xgboost/context.h" // for Context +#include "xgboost/span.h" // for Span +namespace xgboost::tree { /** * \brief An atomicAdd designed for gradient pair with better performance. For general * int64_t atomicAdd, one can simply cast it to unsigned long long. Exposed for testing. @@ -32,7 +33,7 @@ XGBOOST_DEV_INLINE void AtomicAdd64As32(int64_t* dst, int64_t src) { } class GradientQuantiser { -private: + private: /* Convert gradient to fixed point representation. 
*/ GradientPairPrecise to_fixed_point_; /* Convert fixed point representation back to floating point. */ @@ -59,13 +60,22 @@ private: } }; -void BuildGradientHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, - FeatureGroupsAccessor const& feature_groups, - common::Span gpair, - common::Span ridx, - common::Span histogram, GradientQuantiser rounding, - bool force_global_memory = false); -} // namespace tree -} // namespace xgboost +class DeviceHistogramBuilderImpl; + +class DeviceHistogramBuilder { + std::unique_ptr p_impl_; + public: + DeviceHistogramBuilder(); + ~DeviceHistogramBuilder(); + + void Reset(Context const* ctx, FeatureGroupsAccessor const& feature_groups, + bool force_global_memory); + void BuildHistogram(CUDAContext const* ctx, EllpackDeviceAccessor const& matrix, + FeatureGroupsAccessor const& feature_groups, + common::Span gpair, + common::Span ridx, + common::Span histogram, GradientQuantiser rounding); +}; +} // namespace xgboost::tree #endif // HISTOGRAM_CUH_ diff --git a/src/tree/gpu_hist/row_partitioner.cu b/src/tree/gpu_hist/row_partitioner.cu index 35b43d24bd08..f66fac489da3 100644 --- a/src/tree/gpu_hist/row_partitioner.cu +++ b/src/tree/gpu_hist/row_partitioner.cu @@ -1,28 +1,23 @@ -/*! - * Copyright 2017-2022 XGBoost contributors +/** + * Copyright 2017-2024, XGBoost contributors */ -#include -#include -#include +#include // for sequence -#include +#include // for vector -#include "../../common/device_helpers.cuh" +#include "../../common/cuda_context.cuh" // for CUDAContext +#include "../../common/device_helpers.cuh" // for CopyDeviceSpanToVector, ToSpan #include "row_partitioner.cuh" -namespace xgboost { -namespace tree { - -RowPartitioner::RowPartitioner(DeviceOrd device_idx, size_t num_rows) - : device_idx_(device_idx), ridx_(num_rows), ridx_tmp_(num_rows) { +namespace xgboost::tree { +RowPartitioner::RowPartitioner(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid) + : device_idx_(ctx->Device()), ridx_(n_samples), ridx_tmp_(n_samples) { dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); - ridx_segments_.emplace_back(NodePositionInfo{Segment(0, num_rows)}); - thrust::sequence(thrust::device, ridx_.data(), ridx_.data() + ridx_.size()); + ridx_segments_.emplace_back(NodePositionInfo{Segment(0, n_samples)}); + thrust::sequence(ctx->CUDACtx()->CTP(), ridx_.data(), ridx_.data() + ridx_.size(), base_rowid); } -RowPartitioner::~RowPartitioner() { - dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); -} +RowPartitioner::~RowPartitioner() { dh::safe_cuda(cudaSetDevice(device_idx_.ordinal)); } common::Span RowPartitioner::GetRows(bst_node_t nidx) { auto segment = ridx_segments_.at(nidx).segment; @@ -39,6 +34,4 @@ std::vector RowPartitioner::GetRowsHost(bst_node_t ni dh::CopyDeviceSpanToVector(&rows, span); return rows; } - -}; // namespace tree -}; // namespace xgboost +}; // namespace xgboost::tree diff --git a/src/tree/gpu_hist/row_partitioner.cuh b/src/tree/gpu_hist/row_partitioner.cuh index fde6c4dd0fa9..636de54e6c25 100644 --- a/src/tree/gpu_hist/row_partitioner.cuh +++ b/src/tree/gpu_hist/row_partitioner.cuh @@ -1,17 +1,17 @@ -/*! 
- * Copyright 2017-2022 XGBoost contributors +/** + * Copyright 2017-2024, XGBoost contributors */ #pragma once #include +#include // for make_counting_iterator +#include // for make_transform_output_iterator -#include -#include +#include // for max +#include // for vector -#include "../../common/device_helpers.cuh" -#include "xgboost/base.h" -#include "xgboost/context.h" -#include "xgboost/task.h" -#include "xgboost/tree_model.h" +#include "../../common/device_helpers.cuh" // for MakeTransformIterator +#include "xgboost/base.h" // for bst_idx_t +#include "xgboost/context.h" // for Context namespace xgboost { namespace tree { @@ -223,7 +223,12 @@ class RowPartitioner { dh::PinnedMemory pinned2_; public: - RowPartitioner(DeviceOrd device_idx, size_t num_rows); + /** + * @param ctx Context for device ordinal and stream. + * @param n_samples The number of samples in each batch. + * @param base_rowid The base row index for the current batch. + */ + RowPartitioner(Context const* ctx, bst_idx_t n_samples, bst_idx_t base_rowid); ~RowPartitioner(); RowPartitioner(const RowPartitioner&) = delete; RowPartitioner& operator=(const RowPartitioner&) = delete; diff --git a/src/tree/hist/evaluate_splits.h b/src/tree/hist/evaluate_splits.h index cc312d31ce35..12f53b99549b 100644 --- a/src/tree/hist/evaluate_splits.h +++ b/src/tree/hist/evaluate_splits.h @@ -776,7 +776,7 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree, if (!tree[nidx].IsDeleted() && tree[nidx].IsLeaf()) { auto const &rowset = part[nidx]; auto leaf_value = tree[nidx].LeafValue(); - for (const size_t *it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) { + for (auto const *it = rowset.begin() + r.begin(); it < rowset.begin() + r.end(); ++it) { out_preds(*it) += leaf_value; } } @@ -811,7 +811,8 @@ void UpdatePredictionCacheImpl(Context const *ctx, RegTree const *p_last_tree, if (tree.IsLeaf(nidx)) { auto const &rowset = part[nidx]; auto leaf_value = mttree->LeafValue(nidx); - for (std::size_t const *it = rowset.begin + r.begin(); it < rowset.begin + r.end(); ++it) { + for (bst_idx_t const *it = rowset.begin() + r.begin(); it < rowset.begin() + r.end(); + ++it) { for (std::size_t i = 0; i < n_targets; ++i) { out_preds(*it, i) += leaf_value(i); } diff --git a/src/tree/hist/histogram.h b/src/tree/hist/histogram.h index e589ae620cac..4bbab25987df 100644 --- a/src/tree/hist/histogram.h +++ b/src/tree/hist/histogram.h @@ -78,13 +78,13 @@ class HistogramBuilder { common::ParallelFor2d(space, this->n_threads_, [&](size_t nid_in_set, common::Range1d r) { const auto tid = static_cast(omp_get_thread_num()); bst_node_t const nidx = nodes_to_build[nid_in_set]; - auto elem = row_set_collection[nidx]; + auto const& elem = row_set_collection[nidx]; auto start_of_row_set = std::min(r.begin(), elem.Size()); auto end_of_row_set = std::min(r.end(), elem.Size()); - auto rid_set = common::RowSetCollection::Elem(elem.begin + start_of_row_set, - elem.begin + end_of_row_set, nidx); + auto rid_set = common::Span{elem.begin() + start_of_row_set, + elem.begin() + end_of_row_set}; auto hist = buffer_.GetInitializedHist(tid, nid_in_set); - if (rid_set.Size() != 0) { + if (rid_set.size() != 0) { common::BuildHist(gpair_h, rid_set, gidx, hist, force_read_by_column); } }); diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu index 958fa0331569..366cf3aad08e 100644 --- a/src/tree/updater_gpu_hist.cu +++ b/src/tree/updater_gpu_hist.cu @@ -162,6 +162,8 @@ struct GPUHistMakerDevice { std::shared_ptr 
column_sampler_; MetaInfo const& info_; + DeviceHistogramBuilder histogram_; + public: EllpackPageImpl const* page{nullptr}; common::Span feature_types; @@ -249,13 +251,16 @@ struct GPUHistMakerDevice { quantiser = std::make_unique(ctx_, this->gpair, dmat->Info()); row_partitioner.reset(); // Release the device memory first before reallocating - row_partitioner = std::make_unique(ctx_->Device(), sample.sample_rows); + CHECK_EQ(page->base_rowid, 0); + row_partitioner = std::make_unique(ctx_, sample.sample_rows, page->base_rowid); // Init histogram hist.Init(ctx_->Device(), page->Cuts().TotalBins()); hist.Reset(); this->InitFeatureGroupsOnce(); + + this->histogram_.Reset(ctx_, feature_groups->DeviceAccessor(ctx_->Device()), false); } GPUExpandEntry EvaluateRootSplit(GradientPairInt64 root_sum) { @@ -340,9 +345,9 @@ struct GPUHistMakerDevice { void BuildHist(int nidx) { auto d_node_hist = hist.GetNodeHistogram(nidx); auto d_ridx = row_partitioner->GetRows(nidx); - BuildGradientHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), - feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, - d_node_hist, *quantiser); + this->histogram_.BuildHistogram(ctx_->CUDACtx(), page->GetDeviceAccessor(ctx_->Device()), + feature_groups->DeviceAccessor(ctx_->Device()), gpair, d_ridx, + d_node_hist, *quantiser); } // Attempt to do subtraction trick diff --git a/tests/ci_build/lint_python.py b/tests/ci_build/lint_python.py index 079996de66fb..f8bbbc2848b0 100644 --- a/tests/ci_build/lint_python.py +++ b/tests/ci_build/lint_python.py @@ -98,6 +98,7 @@ class LintersPaths: "tests/python/test_model_io.py", "tests/test_distributed/test_federated/", "tests/test_distributed/test_gpu_federated/", + "tests/test_distributed/test_with_dask/test_external_memory.py", "tests/test_distributed/test_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_spark/test_data.py", "tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py", diff --git a/tests/cpp/CMakeLists.txt b/tests/cpp/CMakeLists.txt index 6496f8af45de..deed08165bc2 100644 --- a/tests/cpp/CMakeLists.txt +++ b/tests/cpp/CMakeLists.txt @@ -1,3 +1,6 @@ +# The testxgboost executable is created in the top level CMakeLists. Most of the +# properties and compilation flags are already set. We just need to add source files and +# link gtest here. if(USE_DMLC_GTEST) if(NOT TARGET gtest) message(FATAL_ERROR "USE_DMLC_GTEST=ON but dmlc-core didn't bundle gtest") @@ -6,6 +9,7 @@ if(USE_DMLC_GTEST) else() find_package(GTest REQUIRED) endif() + file(GLOB_RECURSE TEST_SOURCES "*.cc") if(USE_CUDA) @@ -13,6 +17,10 @@ if(USE_CUDA) list(APPEND TEST_SOURCES ${CUDA_TEST_SOURCES}) endif() +# We will add them back later to separate the definition. 
+file(GLOB_RECURSE FEDERATED_TEST_SOURCES "plugin/federated/*.*") +list(REMOVE_ITEM TEST_SOURCES ${FEDERATED_TEST_SOURCES}) + file(GLOB_RECURSE SYCL_TEST_SOURCES "plugin/test_sycl_*.cc") list(REMOVE_ITEM TEST_SOURCES ${SYCL_TEST_SOURCES}) @@ -48,14 +56,14 @@ if(PLUGIN_SYCL) endif() if(PLUGIN_FEDERATED) - target_include_directories(testxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/federated) - target_link_libraries(testxgboost PRIVATE federated_client) -else() - file(GLOB_RECURSE FEDERATED_TEST_SOURCES "plugin/*_federated_*.*") - list(REMOVE_ITEM TEST_SOURCES ${FEDERATED_TEST_SOURCES}) + add_subdirectory(${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated) endif() -target_sources(testxgboost PRIVATE ${TEST_SOURCES} ${xgboost_SOURCE_DIR}/plugin/example/custom_obj.cc) +target_sources( + testxgboost PRIVATE + ${TEST_SOURCES} + ${xgboost_SOURCE_DIR}/plugin/example/custom_obj.cc +) if(USE_CUDA AND PLUGIN_RMM) target_include_directories(testxgboost PRIVATE ${CUDA_INCLUDE_DIRS}) @@ -63,7 +71,6 @@ endif() target_include_directories(testxgboost PRIVATE - ${GTEST_INCLUDE_DIRS} ${xgboost_SOURCE_DIR}/include ${xgboost_SOURCE_DIR}/dmlc-core/include) target_link_libraries(testxgboost diff --git a/tests/cpp/common/test_partition_builder.cc b/tests/cpp/common/test_partition_builder.cc index 08dd345f261f..36fb7a8d9870 100644 --- a/tests/cpp/common/test_partition_builder.cc +++ b/tests/cpp/common/test_partition_builder.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020-2023 by XGBoost contributors + * Copyright 2020-2024, XGBoost contributors */ #include @@ -58,7 +58,7 @@ TEST(PartitionBuilder, BasicTest) { } builder.CalculateRowOffsets(); - std::vector v(*std::max_element(tasks.begin(), tasks.end()) * kBlockSize); + std::vector v(*std::max_element(tasks.begin(), tasks.end()) * kBlockSize); for(size_t nid = 0; nid < kNodes; ++nid) { diff --git a/tests/cpp/common/test_threadpool.cc b/tests/cpp/common/test_threadpool.cc index bd54a9dedbe2..ca8a73b55ff6 100644 --- a/tests/cpp/common/test_threadpool.cc +++ b/tests/cpp/common/test_threadpool.cc @@ -2,6 +2,7 @@ * Copyright 2024, XGBoost Contributors */ #include +#include // for GlobalConfigThreadLocalStore #include // for size_t #include // for int32_t @@ -13,7 +14,23 @@ namespace xgboost::common { TEST(ThreadPool, Basic) { std::int32_t n_threads = std::thread::hardware_concurrency(); - ThreadPool pool{n_threads}; + + // Set verbosity to 0 for thread-local variable. + auto orig = GlobalConfigThreadLocalStore::Get()->verbosity; + GlobalConfigThreadLocalStore::Get()->verbosity = 4; + // 4 is an invalid value, it's only possible to set it by bypassing the parameter + // validation. 
+ ASSERT_NE(orig, GlobalConfigThreadLocalStore::Get()->verbosity); + ThreadPool pool{n_threads, [config = *GlobalConfigThreadLocalStore::Get()] { + *GlobalConfigThreadLocalStore::Get() = config; + }}; + GlobalConfigThreadLocalStore::Get()->verbosity = orig; // restore + + { + auto fut = pool.Submit([] { return GlobalConfigThreadLocalStore::Get()->verbosity; }); + ASSERT_EQ(fut.get(), 4); + ASSERT_EQ(GlobalConfigThreadLocalStore::Get()->verbosity, orig); + } { auto fut = pool.Submit([] { return 3; }); ASSERT_EQ(fut.get(), 3); @@ -45,5 +62,12 @@ TEST(ThreadPool, Basic) { ASSERT_EQ(futures[i].get(), i); } } + { + std::int32_t val{0}; + auto fut = pool.Submit([&] { val = 3; }); + static_assert(std::is_void_v); + fut.get(); + ASSERT_EQ(val, 3); + } } } // namespace xgboost::common diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cc b/tests/cpp/data/test_sparse_page_dmatrix.cc index 33308be19385..3aeb42abce2b 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cc +++ b/tests/cpp/data/test_sparse_page_dmatrix.cc @@ -118,7 +118,8 @@ TEST(SparsePageDMatrix, RetainSparsePage) { // Test GHistIndexMatrix can avoid loading sparse page after the initialization. TEST(SparsePageDMatrix, GHistIndexSkipSparsePage) { dmlc::TemporaryDirectory tmpdir; - auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(6).GenerateSparsePageDMatrix( + std::size_t n_batches = 6; + auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix( tmpdir.path + "/", true); Context ctx; bst_bin_t n_bins{256}; @@ -171,12 +172,30 @@ TEST(SparsePageDMatrix, GHistIndexSkipSparsePage) { // Restore the batch parameter by passing it in again through check_ghist check_ghist(); } + // half the pages - auto it = Xy->GetBatches(&ctx).begin(); - for (std::int32_t i = 0; i < 3; ++i) { - ++it; + { + auto it = Xy->GetBatches(&ctx).begin(); + for (std::size_t i = 0; i < n_batches / 2; ++i) { + ++it; + } + check_ghist(); + } + { + auto it = Xy->GetBatches(&ctx, batch_param).begin(); + for (std::size_t i = 0; i < n_batches / 2; ++i) { + ++it; + } + check_ghist(); + } + { + BatchParam regen{n_bins, common::Span{hess.data(), hess.size()}, true}; + auto it = Xy->GetBatches(&ctx, regen).begin(); + for (std::size_t i = 0; i < n_batches / 2; ++i) { + ++it; + } + check_ghist(); } - check_ghist(); } TEST(SparsePageDMatrix, MetaInfo) { diff --git a/tests/cpp/data/test_sparse_page_dmatrix.cu b/tests/cpp/data/test_sparse_page_dmatrix.cu index 7200b96a919c..327f2ba635fd 100644 --- a/tests/cpp/data/test_sparse_page_dmatrix.cu +++ b/tests/cpp/data/test_sparse_page_dmatrix.cu @@ -41,31 +41,77 @@ TEST(SparsePageDMatrix, EllpackPage) { TEST(SparsePageDMatrix, EllpackSkipSparsePage) { // Test Ellpack can avoid loading sparse page after the initialization. 
dmlc::TemporaryDirectory tmpdir; - auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(6).GenerateSparsePageDMatrix( + std::size_t n_batches = 6; + auto Xy = RandomDataGenerator{180, 12, 0.0}.Batches(n_batches).GenerateSparsePageDMatrix( tmpdir.path + "/", true); auto ctx = MakeCUDACtx(0); + auto cpu = ctx.MakeCPU(); bst_bin_t n_bins{256}; double sparse_thresh{0.8}; BatchParam batch_param{n_bins, sparse_thresh}; - std::int32_t k = 0; - for (auto const& page : Xy->GetBatches(&ctx, batch_param)) { - auto impl = page.Impl(); - ASSERT_EQ(page.Size(), 30); - ASSERT_EQ(k, impl->base_rowid); - k += page.Size(); - } + auto check_ellpack = [&]() { + std::int32_t k = 0; + for (auto const& page : Xy->GetBatches(&ctx, batch_param)) { + auto impl = page.Impl(); + ASSERT_EQ(page.Size(), 30); + ASSERT_EQ(k, impl->base_rowid); + k += page.Size(); + } + }; auto casted = std::dynamic_pointer_cast(Xy); CHECK(casted); + check_ellpack(); + // Make the number of fetches don't change (no new fetch) auto n_fetches = casted->SparsePageFetchCount(); - for (std::int32_t i = 0; i < 3; ++i) { + for (std::size_t i = 0; i < 3; ++i) { for ([[maybe_unused]] auto const& page : Xy->GetBatches(&ctx, batch_param)) { } auto casted = std::dynamic_pointer_cast(Xy); ASSERT_EQ(casted->SparsePageFetchCount(), n_fetches); } + check_ellpack(); + + dh::device_vector hess(Xy->Info().num_row_, 1.0f); + for (std::size_t i = 0; i < 4; ++i) { + for ([[maybe_unused]] auto const& page : Xy->GetBatches(&ctx)) { + } + for ([[maybe_unused]] auto const& page : Xy->GetBatches(&cpu)) { + } + for ([[maybe_unused]] auto const& page : Xy->GetBatches(&ctx, batch_param)) { + } + // Approx tree method pages + { + BatchParam regen{n_bins, dh::ToSpan(hess), false}; + for ([[maybe_unused]] auto const& page : Xy->GetBatches(&ctx, regen)) { + } + } + { + BatchParam regen{n_bins, dh::ToSpan(hess), true}; + for ([[maybe_unused]] auto const& page : Xy->GetBatches(&ctx, regen)) { + } + } + + check_ellpack(); + } + + // half the pages + { + auto it = Xy->GetBatches(&ctx).begin(); + for (std::size_t i = 0; i < n_batches / 2; ++i) { + ++it; + } + check_ellpack(); + } + { + auto it = Xy->GetBatches(&ctx, batch_param).begin(); + for (std::size_t i = 0; i < n_batches / 2; ++i) { + ++it; + } + check_ellpack(); + } } TEST(SparsePageDMatrix, MultipleEllpackPages) { @@ -115,12 +161,7 @@ TEST(SparsePageDMatrix, RetainEllpackPage) { for (size_t i = 0; i < iterators.size(); ++i) { ASSERT_EQ((*iterators[i]).Impl()->gidx_buffer.HostVector(), gidx_buffers.at(i).HostVector()); - if (i != iterators.size() - 1) { - ASSERT_EQ(iterators[i].use_count(), 1); - } else { - // The last batch is still being held by sparse page DMatrix. - ASSERT_EQ(iterators[i].use_count(), 2); - } + ASSERT_EQ(iterators[i].use_count(), 1); } // make sure it's const and the caller can not modify the content of page. 
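The test above advances the iterator over only half of the pages before starting a new pass, which is exactly the case the revised SparsePageSourceImpl::Reset has to handle: a pass that stops early leaves count_ nonzero and stale prefetches in the ring, so the ring must be rebuilt before page 0 can be served again. The toy model below illustrates only that bookkeeping; ToyPageSource is hypothetical and ignores threading, caching, and error handling.

#include <cstdint>
#include <deque>

// Toy model of the page-source bookkeeping: `count_` is the page index of the
// current pass and `ring_` stands in for the queue of pre-fetched pages (here
// just their indices).
class ToyPageSource {
  std::int32_t count_{0};
  std::int32_t n_batches_;
  std::deque<std::int32_t> ring_;

  void Fetch() {
    ring_.clear();
    for (std::int32_t i = 0; i < 2; ++i) {  // prefetch depth of 2
      ring_.push_back((count_ + i) % n_batches_);
    }
  }

 public:
  explicit ToyPageSource(std::int32_t n_batches) : n_batches_{n_batches} { Fetch(); }

  std::int32_t Read() const { return ring_.front(); }

  // Advance to the next page; wraps count_ back to 0 at the end of a pass,
  // mirroring what EndIter() does after committing the cache.
  void Next() {
    count_ = (count_ + 1) % n_batches_;
    Fetch();
  }

  // Start a new pass.  A completed pass already sits at page 0; only a pass
  // that stopped early has stale prefetches and must refill from the start.
  void Reset() {
    if (count_ != 0) {
      count_ = 0;
      Fetch();  // rebuild the ring so the next Read() returns page 0
    }
  }
};

int main() {
  ToyPageSource src{6};
  for (int i = 0; i < 3; ++i) src.Next();  // iterate only half the pages
  src.Reset();
  return src.Read() == 0 ? 0 : 1;          // the new pass starts at page 0
}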
diff --git a/tests/cpp/plugin/federated/CMakeLists.txt b/tests/cpp/plugin/federated/CMakeLists.txt new file mode 100644 index 000000000000..f85304e31ac1 --- /dev/null +++ b/tests/cpp/plugin/federated/CMakeLists.txt @@ -0,0 +1,20 @@ +target_sources( + testxgboost PRIVATE + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_coll.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_comm.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_comm_group.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_tracker.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_learner.cc + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_data.cc +) + +if(USE_CUDA) + target_sources( + testxgboost PRIVATE + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_coll.cu + ${xgboost_SOURCE_DIR}/tests/cpp/plugin/federated/test_federated_comm_group.cu + ) +endif() + +target_include_directories(testxgboost PRIVATE ${xgboost_SOURCE_DIR}/plugin/federated) +target_link_libraries(testxgboost PRIVATE federated_client) diff --git a/tests/cpp/plugin/federated/test_federated_learner.cc b/tests/cpp/plugin/federated/test_federated_learner.cc index b14845cbcfc2..ed0bbcb3b749 100644 --- a/tests/cpp/plugin/federated/test_federated_learner.cc +++ b/tests/cpp/plugin/federated/test_federated_learner.cc @@ -1,8 +1,7 @@ /** * Copyright 2023-2024, XGBoost contributors * - * Some other tests for federated learning are in the main test suite (test_learner.cc), - * gaurded by the `XGBOOST_USE_FEDERATED`. + * Some other tests for federated learning are in the main test suite (test_learner.cc). */ #include #include diff --git a/tests/cpp/tree/gpu_hist/test_histogram.cu b/tests/cpp/tree/gpu_hist/test_histogram.cu index 84cd956db094..d1128446617b 100644 --- a/tests/cpp/tree/gpu_hist/test_histogram.cu +++ b/tests/cpp/tree/gpu_hist/test_histogram.cu @@ -1,19 +1,20 @@ /** - * Copyright 2020-2023, XGBoost Contributors + * Copyright 2020-2024, XGBoost Contributors */ #include +#include // for Context -#include +#include // for unique_ptr +#include // for vector -#include "../../../../src/common/categorical.h" #include "../../../../src/tree/gpu_hist/histogram.cuh" -#include "../../../../src/tree/gpu_hist/row_partitioner.cuh" -#include "../../../../src/tree/param.h" // TrainParam -#include "../../categorical_helpers.h" +#include "../../../../src/tree/gpu_hist/row_partitioner.cuh" // for RowPartitioner +#include "../../../../src/tree/param.h" // for TrainParam +#include "../../categorical_helpers.h" // for OneHotEncodeFeature #include "../../helpers.h" namespace xgboost::tree { -void TestDeterministicHistogram(bool is_dense, int shm_size) { +void TestDeterministicHistogram(bool is_dense, int shm_size, bool force_global) { Context ctx = MakeCUDACtx(0); size_t constexpr kBins = 256, kCols = 120, kRows = 16384, kRounds = 16; float constexpr kLower = -1e-2, kUpper = 1e2; @@ -25,35 +26,37 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { for (auto const& batch : matrix->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); - tree::RowPartitioner row_partitioner(FstCU(), kRows); + tree::RowPartitioner row_partitioner{&ctx, kRows, page->base_rowid}; auto ridx = row_partitioner.GetRows(0); - int num_bins = kBins * kCols; + bst_bin_t num_bins = kBins * kCols; dh::device_vector histogram(num_bins); auto d_histogram = dh::ToSpan(histogram); auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); - gpair.SetDevice(FstCU()); + 
gpair.SetDevice(ctx.Device()); - FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, - sizeof(GradientPairInt64)); + FeatureGroups feature_groups(page->Cuts(), page->is_dense, shm_size, sizeof(GradientPairInt64)); auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), - feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, + DeviceHistogramBuilder builder; + builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, d_histogram, quantiser); std::vector histogram_h(num_bins); dh::safe_cuda(cudaMemcpy(histogram_h.data(), d_histogram.data(), - num_bins * sizeof(GradientPairInt64), - cudaMemcpyDeviceToHost)); + num_bins * sizeof(GradientPairInt64), cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < kRounds; ++i) { + for (std::size_t i = 0; i < kRounds; ++i) { dh::device_vector new_histogram(num_bins); auto d_new_histogram = dh::ToSpan(new_histogram); auto quantiser = GradientQuantiser(&ctx, gpair.DeviceSpan(), MetaInfo()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), - feature_groups.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, + DeviceHistogramBuilder builder; + builder.Reset(&ctx, feature_groups.DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + feature_groups.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, d_new_histogram, quantiser); std::vector new_histogram_h(num_bins); @@ -68,14 +71,16 @@ void TestDeterministicHistogram(bool is_dense, int shm_size) { { auto gpair = GenerateRandomGradients(kRows, kLower, kUpper); - gpair.SetDevice(FstCU()); + gpair.SetDevice(ctx.Device()); // Use a single feature group to compute the baseline. 
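      // A FeatureGroups object constructed from the cuts alone places every feature in a single
      // group, so this baseline histogram is built without the feature-grouping optimization and
      // is compared against the grouped results above.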
FeatureGroups single_group(page->Cuts()); dh::device_vector baseline(num_bins); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(FstCU()), - single_group.DeviceAccessor(FstCU()), gpair.DeviceSpan(), ridx, + DeviceHistogramBuilder builder; + builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(baseline), quantiser); std::vector baseline_h(num_bins); @@ -96,7 +101,9 @@ TEST(Histogram, GPUDeterministic) { std::vector shm_sizes{48 * 1024, 64 * 1024, 160 * 1024}; for (bool is_dense : is_dense_array) { for (int shm_size : shm_sizes) { - TestDeterministicHistogram(is_dense, shm_size); + for (bool force_global : {true, false}) { + TestDeterministicHistogram(is_dense, shm_size, force_global); + } } } } @@ -124,7 +131,7 @@ void TestGPUHistogramCategorical(size_t num_categories) { auto cat_m = GetDMatrixFromData(x, kRows, 1); cat_m->Info().feature_types.HostVector().push_back(FeatureType::kCategorical); auto batch_param = BatchParam{kBins, tree::TrainParam::DftSparseThreshold()}; - tree::RowPartitioner row_partitioner(ctx.Device(), kRows); + tree::RowPartitioner row_partitioner{&ctx, kRows, 0}; auto ridx = row_partitioner.GetRows(0); dh::device_vector cat_hist(num_categories); auto gpair = GenerateRandomGradients(kRows, 0, 2); @@ -136,7 +143,9 @@ void TestGPUHistogramCategorical(size_t num_categories) { for (auto const &batch : cat_m->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); FeatureGroups single_group(page->Cuts()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + DeviceHistogramBuilder builder; + builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(cat_hist), quantiser); } @@ -150,7 +159,9 @@ void TestGPUHistogramCategorical(size_t num_categories) { for (auto const &batch : encode_m->GetBatches(&ctx, batch_param)) { auto* page = batch.Impl(); FeatureGroups single_group(page->Cuts()); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + DeviceHistogramBuilder builder; + builder.Reset(&ctx, single_group.DeviceAccessor(ctx.Device()), false); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), single_group.DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), ridx, dh::ToSpan(encode_hist), quantiser); } @@ -253,4 +264,105 @@ TEST(Histogram, Quantiser) { ASSERT_EQ(gh.GetHess(), 1.0); } } +namespace { +class HistogramExternalMemoryTest : public ::testing::TestWithParam> { + public: + void Run(float sparsity, bool force_global) { + bst_idx_t n_samples{512}, n_features{12}, n_batches{3}; + std::vector> partitioners; + auto p_fmat = RandomDataGenerator{n_samples, n_features, sparsity} + .Batches(n_batches) + .GenerateSparsePageDMatrix("cache", true); + bst_bin_t n_bins = 16; + BatchParam p{n_bins, TrainParam::DftSparseThreshold()}; + auto ctx = MakeCUDACtx(0); + + std::unique_ptr fg; + dh::device_vector single_hist; + dh::device_vector multi_hist; + + auto gpair = GenerateRandomGradients(n_samples); + gpair.SetDevice(ctx.Device()); + auto quantiser = GradientQuantiser{&ctx, gpair.ConstDeviceSpan(), p_fmat->Info()}; + std::shared_ptr cuts; + + { + /** + * Multi page. 
+ */ + std::int32_t k{0}; + for (auto const& page : p_fmat->GetBatches(&ctx, p)) { + auto impl = page.Impl(); + if (k == 0) { + // Initialization + auto d_matrix = impl->GetDeviceAccessor(ctx.Device()); + fg = std::make_unique(impl->Cuts()); + auto init = GradientPairInt64{0, 0}; + multi_hist = decltype(multi_hist)(impl->Cuts().TotalBins(), init); + single_hist = decltype(single_hist)(impl->Cuts().TotalBins(), init); + cuts = std::make_shared(impl->Cuts()); + } + + partitioners.emplace_back( + std::make_unique(&ctx, impl->Size(), impl->base_rowid)); + + auto ridx = partitioners.at(k)->GetRows(0); + auto d_histogram = dh::ToSpan(multi_hist); + DeviceHistogramBuilder builder; + builder.Reset(&ctx, fg->DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), impl->GetDeviceAccessor(ctx.Device()), + fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, + d_histogram, quantiser); + ++k; + } + ASSERT_EQ(k, n_batches); + } + + { + /** + * Single page. + */ + RowPartitioner partitioner{&ctx, p_fmat->Info().num_row_, 0}; + SparsePage concat; + std::vector hess(p_fmat->Info().num_row_, 1.0f); + for (auto const& page : p_fmat->GetBatches()) { + concat.Push(page); + } + EllpackPageImpl page{ + ctx.Device(), cuts, concat, p_fmat->IsDense(), p_fmat->Info().num_col_, {}}; + auto ridx = partitioner.GetRows(0); + auto d_histogram = dh::ToSpan(single_hist); + DeviceHistogramBuilder builder; + builder.Reset(&ctx, fg->DeviceAccessor(ctx.Device()), force_global); + builder.BuildHistogram(ctx.CUDACtx(), page.GetDeviceAccessor(ctx.Device()), + fg->DeviceAccessor(ctx.Device()), gpair.ConstDeviceSpan(), ridx, + d_histogram, quantiser); + } + + std::vector h_single(single_hist.size()); + thrust::copy(single_hist.begin(), single_hist.end(), h_single.begin()); + std::vector h_multi(multi_hist.size()); + thrust::copy(multi_hist.begin(), multi_hist.end(), h_multi.begin()); + + for (std::size_t i = 0; i < single_hist.size(); ++i) { + ASSERT_EQ(h_single[i].GetQuantisedGrad(), h_multi[i].GetQuantisedGrad()); + ASSERT_EQ(h_single[i].GetQuantisedHess(), h_multi[i].GetQuantisedHess()); + } + } +}; +} // namespace + +TEST_P(HistogramExternalMemoryTest, ExternalMemory) { + std::apply(&HistogramExternalMemoryTest::Run, std::tuple_cat(std::make_tuple(this), GetParam())); +} + +INSTANTIATE_TEST_SUITE_P(Histogram, HistogramExternalMemoryTest, ::testing::ValuesIn([]() { + std::vector> params; + for (auto global : {true, false}) { + for (auto sparsity : {0.0f, 0.2f, 0.8f}) { + params.emplace_back(sparsity, global); + } + } + return params; + }())); } // namespace xgboost::tree diff --git a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu index 14ea6fd70a4e..cf0d505d103d 100644 --- a/tests/cpp/tree/gpu_hist/test_row_partitioner.cu +++ b/tests/cpp/tree/gpu_hist/test_row_partitioner.cu @@ -1,25 +1,22 @@ -/*! 
- * Copyright 2019-2022 by XGBoost Contributors +/** + * Copyright 2019-2024, XGBoost Contributors */ #include #include -#include -#include -#include -#include +#include // for size_t +#include // for uint32_t +#include // for vector #include "../../../../src/tree/gpu_hist/row_partitioner.cuh" #include "../../helpers.h" #include "xgboost/base.h" -#include "xgboost/context.h" -#include "xgboost/task.h" -#include "xgboost/tree_model.h" namespace xgboost::tree { void TestUpdatePositionBatch() { const int kNumRows = 10; - RowPartitioner rp(FstCU(), kNumRows); + auto ctx = MakeCUDACtx(0); + RowPartitioner rp{&ctx, kNumRows, 0}; auto rows = rp.GetRowsHost(0); EXPECT_EQ(rows.size(), kNumRows); for (auto i = 0ull; i < kNumRows; i++) { diff --git a/tests/cpp/tree/hist/test_evaluate_splits.cc b/tests/cpp/tree/hist/test_evaluate_splits.cc index 1ac60bbe8268..a269d237fe3c 100644 --- a/tests/cpp/tree/hist/test_evaluate_splits.cc +++ b/tests/cpp/tree/hist/test_evaluate_splits.cc @@ -49,7 +49,7 @@ void TestEvaluateSplits(bool force_read_by_column) { // dense, no missing values GHistIndexMatrix gmat(&ctx, dmat.get(), kMaxBins, 0.5, false); common::RowSetCollection row_set_collection; - std::vector &row_indices = *row_set_collection.Data(); + std::vector &row_indices = *row_set_collection.Data(); row_indices.resize(kRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); @@ -57,7 +57,9 @@ void TestEvaluateSplits(bool force_read_by_column) { HistMakerTrainParam hist_param; hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); - common::BuildHist(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column); + auto const &elem = row_set_collection[0]; + common::BuildHist(row_gpairs, common::Span{elem.begin(), elem.end()}, gmat, hist[0], + force_read_by_column); // Compute total gradient for all data points GradientPairPrecise total_gpair; @@ -319,7 +321,7 @@ void DoTestEvaluateSplitsSecure(bool force_read_by_column) { // dense, no missing values GHistIndexMatrix gmat(&ctx, dmat.get(), kMaxBins, 0.5, false); common::RowSetCollection row_set_collection; - std::vector &row_indices = *row_set_collection.Data(); + auto &row_indices = *row_set_collection.Data(); row_indices.resize(kRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); @@ -327,7 +329,9 @@ void DoTestEvaluateSplitsSecure(bool force_read_by_column) { HistMakerTrainParam hist_param; hist.Reset(gmat.cut.Ptrs().back(), hist_param.max_cached_hist_node); hist.AllocateHistograms({0}); - common::BuildHist(row_gpairs, row_set_collection[0], gmat, hist[0], force_read_by_column); + common::BuildHist(row_gpairs, + common::Span{row_set_collection[0].begin(), row_set_collection[0].end()}, + gmat, hist[0], force_read_by_column); // Compute total gradient for all data points GradientPairPrecise total_gpair; @@ -336,9 +340,7 @@ void DoTestEvaluateSplitsSecure(bool force_read_by_column) { } RegTree tree; - std::vector entries(1); - entries.front().nid = 0; - entries.front().depth = 0; + std::vector entries(1, CPUExpandEntry{0, 1}); evaluator.InitRoot(GradStats{total_gpair}); evaluator.EvaluateSplits(hist, gmat.cut, {}, tree, &entries); diff --git a/tests/cpp/tree/hist/test_histogram.cc b/tests/cpp/tree/hist/test_histogram.cc index 9d13c47aaa02..8a6b27e7d946 100644 --- a/tests/cpp/tree/hist/test_histogram.cc +++ b/tests/cpp/tree/hist/test_histogram.cc @@ -14,7 +14,6 @@ #include // for max #include // for size_t #include // for int32_t, uint32_t 
-#include // for function #include // for back_inserter #include // for numeric_limits #include // for shared_ptr, allocator, unique_ptr @@ -108,7 +107,7 @@ void TestSyncHist(bool is_distributed) { common::RowSetCollection row_set_collection; { row_set_collection.Clear(); - std::vector &row_indices = *row_set_collection.Data(); + std::vector &row_indices = *row_set_collection.Data(); row_indices.resize(kNRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); @@ -251,7 +250,7 @@ void TestBuildHistogram(bool is_distributed, bool force_read_by_column, bool is_ common::RowSetCollection row_set_collection; row_set_collection.Clear(); - std::vector &row_indices = *row_set_collection.Data(); + std::vector &row_indices = *row_set_collection.Data(); row_indices.resize(kNRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); @@ -367,7 +366,7 @@ void TestHistogramCategorical(size_t n_categories, bool force_read_by_column) { common::RowSetCollection row_set_collection; row_set_collection.Clear(); - std::vector &row_indices = *row_set_collection.Data(); + std::vector &row_indices = *row_set_collection.Data(); row_indices.resize(kRows); std::iota(row_indices.begin(), row_indices.end(), 0); row_set_collection.Init(); diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc index b2949e5952a2..d647d3a970bf 100644 --- a/tests/cpp/tree/test_approx.cc +++ b/tests/cpp/tree/test_approx.cc @@ -3,7 +3,6 @@ */ #include -#include "../../../src/common/numeric.h" #include "../../../src/tree/common_row_partitioner.h" #include "../collective/test_worker.h" // for TestDistributedGlobal #include "../helpers.h" @@ -54,20 +53,23 @@ TEST(Approx, Partitioner) { GetSplit(&tree, split_value, &candidates); partitioner.UpdatePosition(&ctx, page, candidates, &tree); - auto left_nidx = tree[RegTree::kRoot].LeftChild(); - auto elem = partitioner[left_nidx]; - ASSERT_LT(elem.Size(), n_samples); - ASSERT_GT(elem.Size(), 1); - for (auto it = elem.begin; it != elem.end; ++it) { - auto value = page.cut.Values().at(page.index[*it]); - ASSERT_LE(value, split_value); + { + auto left_nidx = tree[RegTree::kRoot].LeftChild(); + auto const& elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + for (auto& it : elem) { + auto value = page.cut.Values().at(page.index[it]); + ASSERT_LE(value, split_value); + } } - - auto right_nidx = tree[RegTree::kRoot].RightChild(); - elem = partitioner[right_nidx]; - for (auto it = elem.begin; it != elem.end; ++it) { - auto value = page.cut.Values().at(page.index[*it]); - ASSERT_GT(value, split_value) << *it; + { + auto right_nidx = tree[RegTree::kRoot].RightChild(); + auto const& elem = partitioner[right_nidx]; + for (auto& it : elem) { + auto value = page.cut.Values().at(page.index[it]); + ASSERT_GT(value, split_value) << it; + } } } } @@ -99,23 +101,25 @@ void TestColumnSplitPartitioner(size_t n_samples, size_t base_rowid, std::shared RegTree tree; GetSplit(&tree, mid_value, &candidates); partitioner.UpdatePosition(&ctx, page, candidates, &tree); - - auto left_nidx = tree[RegTree::kRoot].LeftChild(); - auto elem = partitioner[left_nidx]; - ASSERT_LT(elem.Size(), n_samples); - ASSERT_GT(elem.Size(), 1); - auto expected_elem = expected_mid_partitioner[left_nidx]; - ASSERT_EQ(elem.Size(), expected_elem.Size()); - for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) { - ASSERT_EQ(*it, *eit); + { + auto left_nidx = tree[RegTree::kRoot].LeftChild(); + auto 
const& elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + auto const& expected_elem = expected_mid_partitioner[left_nidx]; + ASSERT_EQ(elem.Size(), expected_elem.Size()); + for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { + ASSERT_EQ(*it, *eit); + } } - - auto right_nidx = tree[RegTree::kRoot].RightChild(); - elem = partitioner[right_nidx]; - expected_elem = expected_mid_partitioner[right_nidx]; - ASSERT_EQ(elem.Size(), expected_elem.Size()); - for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) { - ASSERT_EQ(*it, *eit); + { + auto right_nidx = tree[RegTree::kRoot].RightChild(); + auto const& elem = partitioner[right_nidx]; + auto const& expected_elem = expected_mid_partitioner[right_nidx]; + ASSERT_EQ(elem.Size(), expected_elem.Size()); + for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { + ASSERT_EQ(*it, *eit); + } } } } diff --git a/tests/cpp/tree/test_gpu_hist.cu b/tests/cpp/tree/test_gpu_hist.cu index cc4d9fb7fdad..200fb39fb4e9 100644 --- a/tests/cpp/tree/test_gpu_hist.cu +++ b/tests/cpp/tree/test_gpu_hist.cu @@ -1,5 +1,5 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #include #include @@ -22,12 +22,8 @@ #include "xgboost/context.h" #include "xgboost/json.h" -#if defined(XGBOOST_USE_FEDERATED) -#include "../plugin/federated/test_worker.h" // for TestFederatedGlobal -#endif // defined(XGBOOST_USE_FEDERATED) - namespace xgboost::tree { -TEST(GpuHist, DeviceHistogram) { +TEST(GpuHist, DeviceHistogramStorage) { // Ensures that node allocates correctly after reaching `kStopGrowingSize`. dh::safe_cuda(cudaSetDevice(0)); constexpr size_t kNBins = 128; @@ -102,17 +98,17 @@ void TestBuildHist(bool use_shared_memory_histograms) { xgboost::SimpleLCG gen; xgboost::SimpleRealUniformDistribution dist(0.0f, 1.0f); HostDeviceVector gpair(kNRows); - for (auto &gp : gpair.HostVector()) { - bst_float grad = dist(&gen); - bst_float hess = dist(&gen); - gp = GradientPair(grad, hess); + for (auto& gp : gpair.HostVector()) { + float grad = dist(&gen); + float hess = dist(&gen); + gp = GradientPair{grad, hess}; } - gpair.SetDevice(DeviceOrd::CUDA(0)); + gpair.SetDevice(ctx.Device()); - thrust::host_vector h_gidx_buffer (page->gidx_buffer.HostVector()); - maker.row_partitioner = std::make_unique(FstCU(), kNRows); + thrust::host_vector h_gidx_buffer(page->gidx_buffer.HostVector()); + maker.row_partitioner = std::make_unique(&ctx, kNRows, 0); - maker.hist.Init(FstCU(), page->Cuts().TotalBins()); + maker.hist.Init(ctx.Device(), page->Cuts().TotalBins()); maker.hist.AllocateHistograms({0}); maker.gpair = gpair.DeviceSpan(); @@ -121,10 +117,13 @@ void TestBuildHist(bool use_shared_memory_histograms) { maker.InitFeatureGroupsOnce(); - BuildGradientHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(DeviceOrd::CUDA(0)), - maker.feature_groups->DeviceAccessor(DeviceOrd::CUDA(0)), gpair.DeviceSpan(), + DeviceHistogramBuilder builder; + builder.Reset(&ctx, maker.feature_groups->DeviceAccessor(ctx.Device()), + !use_shared_memory_histograms); + builder.BuildHistogram(ctx.CUDACtx(), page->GetDeviceAccessor(ctx.Device()), + maker.feature_groups->DeviceAccessor(ctx.Device()), gpair.DeviceSpan(), maker.row_partitioner->GetRows(0), maker.hist.GetNodeHistogram(0), - *maker.quantiser, !use_shared_memory_histograms); + *maker.quantiser); DeviceHistogramStorage<>& d_hist = maker.hist; diff --git 
a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc index ce637caa4d46..29ae02f8d2b2 100644 --- a/tests/cpp/tree/test_quantile_hist.cc +++ b/tests/cpp/tree/test_quantile_hist.cc @@ -5,7 +5,6 @@ #include #include -#include #include // for size_t #include #include @@ -68,21 +67,24 @@ void TestPartitioner(bst_target_t n_targets) { } else { GetMultiSplitForTest(&tree, split_value, &candidates); } - auto left_nidx = tree.LeftChild(RegTree::kRoot); partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree); - - auto elem = partitioner[left_nidx]; - ASSERT_LT(elem.Size(), n_samples); - ASSERT_GT(elem.Size(), 1); - for (auto it = elem.begin; it != elem.end; ++it) { - auto value = gmat.cut.Values().at(gmat.index[*it]); - ASSERT_LE(value, split_value); + { + auto left_nidx = tree.LeftChild(RegTree::kRoot); + auto const& elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + for (auto& it : elem) { + auto value = gmat.cut.Values().at(gmat.index[it]); + ASSERT_LE(value, split_value); + } } - auto right_nidx = tree.RightChild(RegTree::kRoot); - elem = partitioner[right_nidx]; - for (auto it = elem.begin; it != elem.end; ++it) { - auto value = gmat.cut.Values().at(gmat.index[*it]); - ASSERT_GT(value, split_value); + { + auto right_nidx = tree.RightChild(RegTree::kRoot); + auto const& elem = partitioner[right_nidx]; + for (auto& it : elem) { + auto value = gmat.cut.Values().at(gmat.index[it]); + ASSERT_GT(value, split_value); + } } } } @@ -138,21 +140,24 @@ void VerifyColumnSplitPartitioner(bst_target_t n_targets, size_t n_samples, auto left_nidx = tree.LeftChild(RegTree::kRoot); partitioner.UpdatePosition(&ctx, gmat, column_indices, candidates, &tree); - auto elem = partitioner[left_nidx]; - ASSERT_LT(elem.Size(), n_samples); - ASSERT_GT(elem.Size(), 1); - auto expected_elem = expected_mid_partitioner[left_nidx]; - ASSERT_EQ(elem.Size(), expected_elem.Size()); - for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) { - ASSERT_EQ(*it, *eit); + { + auto const& elem = partitioner[left_nidx]; + ASSERT_LT(elem.Size(), n_samples); + ASSERT_GT(elem.Size(), 1); + auto const& expected_elem = expected_mid_partitioner[left_nidx]; + ASSERT_EQ(elem.Size(), expected_elem.Size()); + for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { + ASSERT_EQ(*it, *eit); + } } - - auto right_nidx = tree.RightChild(RegTree::kRoot); - elem = partitioner[right_nidx]; - expected_elem = expected_mid_partitioner[right_nidx]; - ASSERT_EQ(elem.Size(), expected_elem.Size()); - for (auto it = elem.begin, eit = expected_elem.begin; it != elem.end; ++it, ++eit) { - ASSERT_EQ(*it, *eit); + { + auto right_nidx = tree.RightChild(RegTree::kRoot); + auto const& elem = partitioner[right_nidx]; + auto const& expected_elem = expected_mid_partitioner[right_nidx]; + ASSERT_EQ(elem.Size(), expected_elem.Size()); + for (auto it = elem.begin(), eit = expected_elem.begin(); it != elem.end(); ++it, ++eit) { + ASSERT_EQ(*it, *eit); + } } } } diff --git a/tests/test_distributed/test_with_dask/test_external_memory.py b/tests/test_distributed/test_with_dask/test_external_memory.py new file mode 100644 index 000000000000..cf475d90f294 --- /dev/null +++ b/tests/test_distributed/test_with_dask/test_external_memory.py @@ -0,0 +1,88 @@ +from typing import List, cast + +import numpy as np +from distributed import Client, Scheduler, Worker, get_worker +from distributed.utils_test import gen_cluster + +import xgboost as 
xgb +from xgboost import testing as tm +from xgboost.compat import concat + + +def run_external_memory(worker_id: int, n_workers: int, comm_args: dict) -> None: + n_samples_per_batch = 32 + n_features = 4 + n_batches = 16 + use_cupy = False + + n_threads = get_worker().state.nthreads + with xgb.collective.CommunicatorContext(dmlc_communicator="rabit", **comm_args): + it = tm.IteratorForTest( + *tm.make_batches( + n_samples_per_batch, + n_features, + n_batches, + use_cupy, + random_state=worker_id, + ), + cache="cache", + ) + Xy = xgb.DMatrix(it, nthread=n_threads) + results: xgb.callback.TrainingCallback.EvalsLog = {} + booster = xgb.train( + {"tree_method": "hist", "nthread": n_threads}, + Xy, + evals=[(Xy, "Train")], + num_boost_round=32, + evals_result=results, + ) + assert tm.non_increasing(cast(List[float], results["Train"]["rmse"])) + + lx, ly, lw = [], [], [] + for i in range(n_workers): + x, y, w = tm.make_batches( + n_samples_per_batch, + n_features, + n_batches, + use_cupy, + random_state=i, + ) + lx.extend(x) + ly.extend(y) + lw.extend(w) + + X = concat(lx) + yconcat = concat(ly) + wconcat = concat(lw) + Xy = xgb.DMatrix(X, yconcat, wconcat, nthread=n_threads) + + results_local: xgb.callback.TrainingCallback.EvalsLog = {} + booster = xgb.train( + {"tree_method": "hist", "nthread": n_threads}, + Xy, + evals=[(Xy, "Train")], + num_boost_round=32, + evals_result=results_local, + ) + np.testing.assert_allclose( + results["Train"]["rmse"], results_local["Train"]["rmse"], rtol=1e-4 + ) + + +@gen_cluster(client=True) +async def test_external_memory( + client: Client, s: Scheduler, a: Worker, b: Worker +) -> None: + workers = tm.get_client_workers(client) + args = await client.sync( + xgb.dask._get_rabit_args, + len(workers), + None, + client, + ) + n_workers = len(workers) + + futs = client.map( + run_external_memory, range(n_workers), n_workers=n_workers, comm_args=args + ) + await client.gather(futs)
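
Stepping back to the GPU histogram hunks earlier in this diff: the free function BuildGradientHistogram is replaced throughout by a two-step DeviceHistogramBuilder, where Reset() picks between the shared-memory and global-memory kernels (the force_global flag) and BuildHistogram() does the accumulation. Below is a minimal sketch of that call sequence, assembled from the signatures used in the hunks above; the wrapper function, its parameter types, and the include paths other than the two gpu_hist headers are assumptions for illustration, not part of the patch.

#include "../../../../src/data/ellpack_page.cuh"              // for EllpackPageImpl (path assumed)
#include "../../../../src/tree/gpu_hist/histogram.cuh"        // for DeviceHistogramBuilder
#include "../../../../src/tree/gpu_hist/row_partitioner.cuh"  // for RowPartitioner
#include "xgboost/base.h"                                     // for GradientPair, GradientPairInt64
#include "xgboost/context.h"                                  // for Context

namespace xgboost::tree {
// Builds the root-node histogram for one ellpack page; illustrates the Reset/BuildHistogram
// call order used by the updated tests.
void BuildRootHistogram(Context const* ctx, EllpackPageImpl const* page,
                        FeatureGroups* feature_groups,
                        common::Span<GradientPair const> gpair,
                        common::Span<RowPartitioner::RowIndexT const> ridx,
                        common::Span<GradientPairInt64> d_histogram,
                        GradientQuantiser const& quantiser, bool force_global) {
  DeviceHistogramBuilder builder;
  // force_global == true bypasses the shared-memory kernel, mirroring the new test parameter.
  builder.Reset(ctx, feature_groups->DeviceAccessor(ctx->Device()), force_global);
  builder.BuildHistogram(ctx->CUDACtx(), page->GetDeviceAccessor(ctx->Device()),
                         feature_groups->DeviceAccessor(ctx->Device()), gpair, ridx, d_histogram,
                         quantiser);
}
}  // namespace xgboost::tree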