diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 98d31acf8c6c..32f8e70bec7f 100644 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -57,7 +57,8 @@ Suggests: igraph (>= 1.0.1), float, titanic, - RhpcBLASctl + RhpcBLASctl, + survival Depends: R (>= 4.3.0) Imports: diff --git a/R-package/NAMESPACE b/R-package/NAMESPACE index c9e085e77e0a..f6cc9062ca4d 100644 --- a/R-package/NAMESPACE +++ b/R-package/NAMESPACE @@ -13,6 +13,7 @@ S3method(predict,xgb.Booster) S3method(print,xgb.Booster) S3method(print,xgb.DMatrix) S3method(print,xgb.cv.synchronous) +S3method(print,xgboost) S3method(setinfo,xgb.Booster) S3method(setinfo,xgb.DMatrix) S3method(variable.names,xgb.Booster) diff --git a/R-package/R/utils.R b/R-package/R/utils.R index 69f358751dc8..3f67ff23c9f7 100644 --- a/R-package/R/utils.R +++ b/R-package/R/utils.R @@ -30,6 +30,40 @@ NVL <- function(x, val) { return(c('rank:pairwise', 'rank:ndcg', 'rank:map')) } +.OBJECTIVES_NON_DEFAULT_MODE <- function() { + return(c("reg:logistic", "binary:logitraw", "multi:softmax")) +} + +.BINARY_CLASSIF_OBJECTIVES <- function() { + return(c("binary:logistic", "binary:hinge")) +} + +.MULTICLASS_CLASSIF_OBJECTIVES <- function() { + return("multi:softprob") +} + +.SURVIVAL_RIGHT_CENSORING_OBJECTIVES <- function() { # nolint + return(c("survival:cox", "survival:aft")) +} + +.SURVIVAL_ALL_CENSORING_OBJECTIVES <- function() { # nolint + return("survival:aft") +} + +.REGRESSION_OBJECTIVES <- function() { + return(c( + "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror", + "reg:absoluteerror", "reg:quantileerror", "count:poisson", "reg:gamma", "reg:tweedie" + )) +} + +.MULTI_TARGET_OBJECTIVES <- function() { + return(c( + "reg:squarederror", "reg:squaredlogerror", "reg:logistic", "reg:pseudohubererror", + "reg:quantileerror", "reg:gamma" + )) +} + # # Low-level functions for boosting -------------------------------------------- diff --git a/R-package/R/xgb.Booster.R b/R-package/R/xgb.Booster.R index 77b33f16db44..cfea11ae33c6 100644 --- a/R-package/R/xgb.Booster.R +++ b/R-package/R/xgb.Booster.R @@ -663,9 +663,8 @@ validate.features <- function(bst, newdata) { #' data(agaricus.train, package = "xgboost") #' train <- agaricus.train #' -#' bst <- xgboost( -#' data = train$data, -#' label = train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, #' eta = 1, #' nthread = 2, @@ -767,9 +766,8 @@ xgb.attributes <- function(object) { #' data.table::setDTthreads(nthread) #' train <- agaricus.train #' -#' bst <- xgboost( -#' data = train$data, -#' label = train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, #' eta = 1, #' nthread = nthread, @@ -817,9 +815,8 @@ xgb.config <- function(object) { #' data(agaricus.train, package = "xgboost") #' train <- agaricus.train #' -#' bst <- xgboost( -#' data = train$data, -#' label = train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, #' eta = 1, #' nthread = 2, @@ -1230,9 +1227,8 @@ xgb.is.same.Booster <- function(obj1, obj2) { #' data(agaricus.train, package = "xgboost") #' train <- agaricus.train #' -#' bst <- xgboost( -#' data = train$data, -#' label = train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(train$data, label = train$label), #' max_depth = 2, #' eta = 1, #' nthread = 2, diff --git a/R-package/R/xgb.DMatrix.R b/R-package/R/xgb.DMatrix.R index 15f6faed0ba0..d87d1cbf71c2 100644 --- 
a/R-package/R/xgb.DMatrix.R +++ b/R-package/R/xgb.DMatrix.R @@ -853,36 +853,6 @@ xgb.DMatrix.hasinfo <- function(object, info) { } -# get dmatrix from data, label -# internal helper method -xgb.get.DMatrix <- function(data, label, missing, weight, nthread) { - if (inherits(data, "dgCMatrix") || is.matrix(data)) { - if (is.null(label)) { - stop("label must be provided when data is a matrix") - } - dtrain <- xgb.DMatrix(data, label = label, missing = missing, nthread = nthread) - if (!is.null(weight)) { - setinfo(dtrain, "weight", weight) - } - } else { - if (!is.null(label)) { - warning("xgboost: label will be ignored.") - } - if (is.character(data)) { - data <- path.expand(data) - dtrain <- xgb.DMatrix(data[1]) - } else if (inherits(data, "xgb.DMatrix")) { - dtrain <- data - } else if (inherits(data, "data.frame")) { - stop("xgboost doesn't support data.frame as input. Convert it to matrix first.") - } else { - stop("xgboost: invalid input data") - } - } - return(dtrain) -} - - #' Dimensions of xgb.DMatrix #' #' Returns a vector of numbers of rows and of columns in an \code{xgb.DMatrix}. diff --git a/R-package/R/xgb.dump.R b/R-package/R/xgb.dump.R index 2fa5bcb2f628..ef7202a1a5db 100644 --- a/R-package/R/xgb.dump.R +++ b/R-package/R/xgb.dump.R @@ -29,8 +29,8 @@ #' data(agaricus.test, package='xgboost') #' train <- agaricus.train #' test <- agaricus.test -#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2, -#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, +#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") #' # save the model in file 'xgb.model.dump' #' dump_path = file.path(tempdir(), 'model.dump') #' xgb.dump(bst, dump_path, with_stats = TRUE) diff --git a/R-package/R/xgb.importance.R b/R-package/R/xgb.importance.R index 547d9677b798..bbf816a0d6cc 100644 --- a/R-package/R/xgb.importance.R +++ b/R-package/R/xgb.importance.R @@ -46,9 +46,8 @@ #' # binomial classification using "gbtree": #' data(agaricus.train, package = "xgboost") #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 2, #' eta = 1, #' nthread = 2, @@ -59,9 +58,8 @@ #' xgb.importance(model = bst) #' #' # binomial classification using "gblinear": -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' booster = "gblinear", #' eta = 0.3, #' nthread = 1, @@ -73,9 +71,11 @@ #' # multiclass classification using "gbtree": #' nclass <- 3 #' nrounds <- 10 -#' mbst <- xgboost( -#' data = as.matrix(iris[, -5]), -#' label = as.numeric(iris$Species) - 1, +#' mbst <- xgb.train( +#' data = xgb.DMatrix( +#' as.matrix(iris[, -5]), +#' label = as.numeric(iris$Species) - 1 +#' ), #' max_depth = 3, #' eta = 0.2, #' nthread = 2, @@ -99,9 +99,11 @@ #' ) #' #' # multiclass classification using "gblinear": -#' mbst <- xgboost( -#' data = scale(as.matrix(iris[, -5])), -#' label = as.numeric(iris$Species) - 1, +#' mbst <- xgb.train( +#' data = xgb.DMatrix( +#' scale(as.matrix(iris[, -5])), +#' label = as.numeric(iris$Species) - 1 +#' ), #' booster = "gblinear", #' eta = 0.2, #' nthread = 1, diff --git a/R-package/R/xgb.model.dt.tree.R b/R-package/R/xgb.model.dt.tree.R index ff416b73e38a..73cdecc5c3ae 100644 --- 
a/R-package/R/xgb.model.dt.tree.R +++ b/R-package/R/xgb.model.dt.tree.R @@ -43,9 +43,8 @@ #' nthread <- 1 #' data.table::setDTthreads(nthread) #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 2, #' eta = 1, #' nthread = nthread, diff --git a/R-package/R/xgb.plot.deepness.R b/R-package/R/xgb.plot.deepness.R index 8e1972374546..956ee9c83fd0 100644 --- a/R-package/R/xgb.plot.deepness.R +++ b/R-package/R/xgb.plot.deepness.R @@ -48,9 +48,8 @@ #' data.table::setDTthreads(nthread) #' #' ## Change max_depth to a higher number to get a more significant result -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 6, #' nthread = nthread, #' nrounds = 50, diff --git a/R-package/R/xgb.plot.importance.R b/R-package/R/xgb.plot.importance.R index 1848a3a86e53..199595cb8ddf 100644 --- a/R-package/R/xgb.plot.importance.R +++ b/R-package/R/xgb.plot.importance.R @@ -51,9 +51,8 @@ #' nthread <- 2 #' data.table::setDTthreads(nthread) #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 3, #' eta = 1, #' nthread = nthread, diff --git a/R-package/R/xgb.plot.multi.trees.R b/R-package/R/xgb.plot.multi.trees.R index e6d678ee7a4f..19a114071509 100644 --- a/R-package/R/xgb.plot.multi.trees.R +++ b/R-package/R/xgb.plot.multi.trees.R @@ -35,9 +35,8 @@ #' nthread <- 2 #' data.table::setDTthreads(nthread) #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), #' max_depth = 15, #' eta = 1, #' nthread = nthread, diff --git a/R-package/R/xgb.plot.shap.R b/R-package/R/xgb.plot.shap.R index 788a095399ed..be3f7116034c 100644 --- a/R-package/R/xgb.plot.shap.R +++ b/R-package/R/xgb.plot.shap.R @@ -82,9 +82,8 @@ #' data.table::setDTthreads(nthread) #' nrounds <- 20 #' -#' bst <- xgboost( -#' agaricus.train$data, -#' agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), #' nrounds = nrounds, #' eta = 0.1, #' max_depth = 3, @@ -108,9 +107,8 @@ #' set.seed(123) #' is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values #' -#' mbst <- xgboost( -#' data = x, -#' label = as.numeric(iris$Species) - 1, +#' mbst <- xgb.train( +#' data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1), #' nrounds = nrounds, #' max_depth = 2, #' eta = 0.3, diff --git a/R-package/R/xgb.plot.tree.R b/R-package/R/xgb.plot.tree.R index 5ed1e70f695a..502de3f52d61 100644 --- a/R-package/R/xgb.plot.tree.R +++ b/R-package/R/xgb.plot.tree.R @@ -68,9 +68,8 @@ #' @examples #' data(agaricus.train, package = "xgboost") #' -#' bst <- xgboost( -#' data = agaricus.train$data, -#' label = agaricus.train$label, +#' bst <- xgb.train( +#' data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), #' max_depth = 3, #' eta = 1, #' nthread = 2, diff --git a/R-package/R/xgb.train.R b/R-package/R/xgb.train.R index 30bf1f1ea149..a0933213be5a 100644 --- a/R-package/R/xgb.train.R +++ b/R-package/R/xgb.train.R @@ -182,12 +182,6 @@ #' as R attributes, and thus do not get saved when using XGBoost's own serializaters like #' \link{xgb.save} 
(but are kept when using R serializers like \link{saveRDS}). #' @param ... other parameters to pass to \code{params}. -#' @param label vector of response values. Should not be provided when data is -#' a local data file name or an \code{xgb.DMatrix}. -#' @param missing by default is set to NA, which means that NA values should be considered as 'missing' -#' by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values. -#' This parameter is only used when input is a dense matrix. -#' @param weight a vector indicating the weight for each row of the input. #' #' @return #' An object of class \code{xgb.Booster}. @@ -328,12 +322,10 @@ #' early_stopping_rounds = 3) #' #' ## An 'xgboost' interface example: -#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, -#' max_depth = 2, eta = 1, nthread = nthread, nrounds = 2, -#' objective = "binary:logistic") +#' bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label), +#' max_depth = 2, eta = 1, nthreads = nthread, nrounds = 2) #' pred <- predict(bst, agaricus.test$data) #' -#' @rdname xgb.train #' @export xgb.train <- function(params = list(), data, nrounds, evals = list(), obj = NULL, feval = NULL, verbose = 1, print_every_n = 1L, diff --git a/R-package/R/xgboost.R b/R-package/R/xgboost.R index a1d37358162c..9ea66731bf81 100644 --- a/R-package/R/xgboost.R +++ b/R-package/R/xgboost.R @@ -1,32 +1,1005 @@ -# Simple interface for training an xgboost model that wraps \code{xgb.train}. -# Its documentation is combined with xgb.train. -# -#' @rdname xgb.train +prescreen.parameters <- function(params) { + if (!NROW(params)) { + return(list()) + } + if (!is.list(params)) { + stop("'params' must be a list or NULL.") + } + + params <- params[!vapply(params, is.null, logical(1L))] + + if ("num_class" %in% names(params)) { + stop("'num_class' cannot be manually specified for 'xgboost()'. Pass a factor 'y' instead.") + } + if ("process_type" %in% names(params)) { + if (params$process_type != "default") { + stop("Non-default 'process_type' is not supported for 'xgboost()'. Try 'xgb.train()'.") + } + } + + return(params) +} + +prescreen.objective <- function(objective) { + if (!is.null(objective)) { + if (!is.character(objective) || length(objective) != 1L || is.na(objective)) { + stop("'objective' must be a single character/string variable.") + } + + if (objective %in% .OBJECTIVES_NON_DEFAULT_MODE()) { + stop( + "Objectives with non-default prediction mode (", + paste(.OBJECTIVES_NON_DEFAULT_MODE(), collapse = ", "), + ") are not supported in 'xgboost()'. Try 'xgb.train()'." + ) + } + } +} + +process.base.margin <- function(base_margin, nrows, ncols) { + if (!NROW(base_margin)) { + return(NULL) + } + if (is.array(base_margin) && length(dim(base_margin)) > 2) { + stop( + "'base_margin' should not have more than 2 dimensions for any objective (got: ", + length(dim(base_margin)), + " dimensions)." + ) + } + if (inherits(base_margin, c("sparseMatrix", "sparseVector"))) { + warning( + "Got a sparse matrix type (class: ", + paste(class(base_margin), collapse = ", "), + ") for 'base_margin'. Will convert to dense matrix." + ) + base_margin <- as.matrix(base_margin) + } + if (NROW(base_margin) != nrows) { + stop( + "'base_margin' has incorrect number of rows. Expected: ", + nrows, + ". 
Got: ", + NROW(base_margin) + ) + } + + if (ncols == 1L) { + if (inherits(base_margin, c("matrix", "data.frame"))) { + if (ncol(base_margin) != 1L) { + stop("'base_margin' should be a 1-d vector for the given objective and data.") + } + if (is.data.frame(base_margin)) { + base_margin <- base_margin[[1L]] + } else { + base_margin <- base_margin[, 1L] + } + } + if (!is.numeric(base_margin)) { + base_margin <- as.numeric(base_margin) + } + } else { + supported_multicol <- c("matrix", "data.frame") + if (!inherits(base_margin, supported_multicol)) { + stop( + "'base_margin' should be a matrix with ", + ncols, + " columns for the given objective and data. Got class: ", + paste(class(base_margin), collapse = ", ") + ) + } + if (ncol(base_margin) != ncols) { + stop( + "'base_margin' has incorrect number of columns. Expected: ", + ncols, + ". Got: ", + ncol(base_margin) + ) + } + if (!is.matrix(base_margin)) { + base_margin <- as.matrix(base_margin) + } + } + + return(base_margin) +} + +process.y.margin.and.objective <- function( + y, + base_margin, + objective, + params +) { + + if (!NROW(y)) { + stop("Passed empty 'y'.") + } + + if (is.array(y) && length(dim(y)) > 2) { + stop( + "'y' should not have more than 2 dimensions for any objective (got: ", + length(dim(y)), + ")." + ) + } + + if (inherits(y, c("sparseMatrix", "sparseVector"))) { + warning( + "Got a sparse matrix type (class: ", + paste(class(y), collapse = ", "), + ") for 'y'. Will convert to dense matrix." + ) + y <- as.matrix(y) + } + + if (is.character(y)) { + if (!is.vector(y)) { + if (NCOL(y) > 1L) { + stop("Multi-column categorical 'y' is not supported.") + } + y <- as.vector(y) + } + y <- factor(y) + } + + if (is.logical(y)) { + if (!is.vector(y)) { + if (NCOL(y) > 1L) { + stop("Multi-column logical/boolean 'y' is not supported.") + } + y <- as.vector(y) + } + y <- factor(y, c(FALSE, TRUE)) + } + + if (is.factor(y)) { + + y_levels <- levels(y) + if (length(y_levels) < 2) { + stop("Factor 'y' has less than 2 levels.") + } + if (length(y_levels) == 2) { + if (is.null(objective)) { + objective <- "binary:logistic" + } else { + if (!(objective %in% .BINARY_CLASSIF_OBJECTIVES())) { + stop( + "Got binary 'y' - supported objectives for this data are: ", + paste(.BINARY_CLASSIF_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, length(y), 1) + } + + out <- list( + params = list( + objective = objective + ), + metadata = list( + y_levels = y_levels, + n_targets = 1 + ) + ) + } else { # length(levels) > 2 + if (is.null(objective)) { + objective <- "multi:softprob" + } else { + if (!(objective %in% .MULTICLASS_CLASSIF_OBJECTIVES())) { + stop( + "Got non-binary factor 'y' - supported objectives for this data are: ", + paste(.MULTICLASS_CLASSIF_OBJECTIVES(), collapse = ", "), + ". 
Was passed: ", + objective + ) + } + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, length(y), length(y_levels)) + } + + out <- list( + params = list( + objective = objective, + num_class = length(y_levels) + ), + metadata = list( + y_levels = y_levels, + n_targets = length(y_levels) + ) + ) + } + + out$dmatrix_args <- list( + label = as.numeric(y) - 1, + base_margin = base_margin + ) + + } else if (inherits(y, "Surv")) { + + y_attr <- attributes(y) + supported_surv_types <- c("left", "right", "interval") + if (!(y_attr$type %in% supported_surv_types)) { + stop( + "Survival objectives are only supported for types: ", + paste(supported_surv_types, collapse = ", "), + ". Was passed: ", + y_attr$type + ) + } + + if (is.null(objective)) { + objective <- "survival:aft" + } else { + if (y_attr$type == "right") { + if (!(objective %in% .SURVIVAL_RIGHT_CENSORING_OBJECTIVES())) { + stop( + "Got right-censored 'y' variable - supported objectives for this data are: ", + paste(.SURVIVAL_RIGHT_CENSORING_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + } else { + if (!(objective %in% .SURVIVAL_ALL_CENSORING_OBJECTIVES())) { + stop( + "Got ", y_attr$type, "-censored 'y' variable - supported objectives for this data are:", + paste(.SURVIVAL_ALL_CENSORING_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + } + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, nrow(y), 1) + } + + out <- list( + params = list( + objective = objective + ), + metadata = list( + n_targets = 1 + ) + ) + + # Note: the 'Surv' object class that is passed as 'y' might have either 2 or 3 columns + # depending on the type of censoring, and the last column in both cases is the one that + # indicates the observation type (e.g. censored / uncensored). + # In the case of interval censoring, the second column will not always have values with + # infinites filled in. For more information, see the code behind the 'print.Surv' method. + + if (objective == "survival:cox") { + # Can only get here when using right censoring + if (y_attr$type != "right") { + stop("Internal error.") + } + + out$dmatrix_args <- list( + label = y[, 1L] * (2 * (y[, 2L] - 0.5)) + ) + + } else { + if (y_attr$type == "left") { + lb <- ifelse( + y[, 2L] == 0, + 0, + y[, 1L] + ) + ub <- y[, 1L] + out$dmatrix_args <- list( + label_lower_bound = lb, + label_upper_bound = ub + ) + } else if (y_attr$type == "right") { + lb <- y[, 1L] + ub <- ifelse( + y[, 2L] == 0, + Inf, + y[, 1L] + ) + out$dmatrix_args <- list( + label_lower_bound = lb, + label_upper_bound = ub + ) + } else if (y_attr$type == "interval") { + out$dmatrix_args <- list( + label_lower_bound = ifelse(y[, 3L] == 2, 0, y[, 1L]), + label_upper_bound = ifelse( + y[, 3L] == 0, Inf, + ifelse(y[, 3L] == 3, y[, 2L], y[, 1L]) + ) + ) + } + + if (min(out$dmatrix_args$label_lower_bound) < 0) { + stop("Survival objectives are only defined for non-negative 'y'.") + } + } + + out$dmatrix_args$base_margin <- base_margin + + } else if (is.vector(y)) { + + if (is.null(objective)) { + objective <- "reg:squarederror" + } else if (!(objective %in% .REGRESSION_OBJECTIVES())) { + stop( + "Got numeric 'y' - supported objectives for this data are: ", + paste(.REGRESSION_OBJECTIVES(), collapse = ", "), + ". 
Was passed: ", + objective + ) + } + + n_targets <- 1L + if (objective == "reg:quantileerror" && NROW(params$quantile_alpha) > 1) { + n_targets <- NROW(params$quantile_alpha) + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, length(y), n_targets) + } + + out <- list( + params = list( + objective = objective + ), + metadata = list( + n_targets = n_targets + ), + dmatrix_args = list( + label = as.numeric(y), + base_margin = base_margin + ) + ) + + } else if (is.data.frame(y)) { + if (ncol(y) == 1L) { + return(process.y.margin.and.objective(y[[1L]], base_margin, objective, params)) + } + + if (is.null(objective)) { + objective <- "reg:squarederror" + } else if (!(objective %in% .MULTI_TARGET_OBJECTIVES())) { + stop( + "Got multi-column 'y' - supported objectives for this data are: ", + paste(.MULTI_TARGET_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + + y_names <- names(y) + y <- lapply(y, function(x) { + if (!inherits(x, c("numeric", "integer"))) { + stop( + "Multi-target 'y' only supports 'numeric' and 'integer' types. Got: ", + paste(class(x), collapse = ", ") + ) + } + return(as.numeric(x)) + }) + y <- as.data.frame(y) |> as.matrix() + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, length(y), ncol(y)) + } + + out <- list( + params = list( + objective = objective + ), + dmatrix_args = list( + label = y, + base_margin = base_margin + ), + metadata = list( + y_names = y_names, + n_targets = ncol(y) + ) + ) + + } else if (is.matrix(y)) { + if (ncol(y) == 1L) { + return(process.y.margin.and.objective(as.vector(y), base_margin, objective, params)) + } + + if (!is.null(objective) && !(objective %in% .MULTI_TARGET_OBJECTIVES())) { + stop( + "Got multi-column 'y' - supported objectives for this data are: ", + paste(.MULTI_TARGET_OBJECTIVES(), collapse = ", "), + ". Was passed: ", + objective + ) + } + if (is.null(objective)) { + objective <- "reg:squarederror" + } + + y_names <- colnames(y) + if (storage.mode(y) != "double") { + storage.mode(y) <- "double" + } + + if (!is.null(base_margin)) { + base_margin <- process.base.margin(base_margin, nrow(y), ncol(y)) + } + + out <- list( + params = list( + objective = objective + ), + dmatrix_args = list( + label = y, + base_margin = base_margin + ), + metadata = list( + n_targets = ncol(y) + ) + ) + + if (NROW(y_names) == ncol(y)) { + out$metadata$y_names <- y_names + } + + } else { + stop("Passed 'y' object with unsupported class: ", paste(class(y), collapse = ", ")) + } + + return(out) +} + +process.row.weights <- function(w, lst_args) { + if (!is.null(w)) { + if ("label" %in% names(lst_args$dmatrix_args)) { + nrow_y <- NROW(lst_args$dmatrix_args$label) + } else if ("label_lower_bound" %in% names(lst_args$dmatrix_args)) { + nrow_y <- length(lst_args$dmatrix_args$label_lower_bound) + } else { + stop("Internal error.") + } + if (!is.numeric(w)) { + w <- as.numeric(w) + } + if (length(w) != nrow_y) { + stop( + "'weights' must be a 1-d vector with the same length as 'y' (", + length(w), " vs. ", nrow_y, ")." 
+ ) + } + lst_args$dmatrix_args$weight <- w + } + return(lst_args) +} + +check.nthreads <- function(nthreads) { + if (is.null(nthreads)) { + return(1L) + } + if (!inherits(nthreads, c("numeric", "integer")) || !NROW(nthreads)) { + stop("'nthreads' must be a positive scalar value.") + } + if (length(nthreads) > 1L) { + nthreads <- utils::head(nthreads, 1L) + } + if (is.na(nthreads) || nthreads < 0) { + stop("Passed invalid 'nthreads': ", nthreads) + } + if (is.numeric(nthreads)) { + if (floor(nthreads) != nthreads) { + stop("'nthreads' must be an integer.") + } + } + return(as.integer(nthreads)) +} + +check.can.use.qdm <- function(x, params) { + if ("booster" %in% names(params)) { + if (params$booster == "gblinear") { + return(FALSE) + } + } + if ("tree_method" %in% names(params)) { + if (params$tree_method %in% c("exact", "approx")) { + return(FALSE) + } + } + return(TRUE) +} + +process.x.and.col.args <- function( + x, + monotone_constraints, + interaction_constraints, + feature_weights, + lst_args, + use_qdm +) { + if (is.null(x)) { + stop("'x' cannot be NULL.") + } + if (inherits(x, "xgb.DMatrix")) { + stop("Cannot pass 'xgb.DMatrix' as 'x' to 'xgboost()'. Try 'xgb.train()' instead.") + } + supported_x_types <- c("data.frame", "matrix", "dgTMatrix", "dgCMatrix", "dgRMatrix") + if (!inherits(x, supported_x_types)) { + stop( + "'x' must be one of the following classes: ", + paste(supported_x_types, collapse = ", "), + ". Got: ", + paste(class(x), collapse = ", ") + ) + } + if (use_qdm && inherits(x, "sparseMatrix") && !inherits(x, "dgRMatrix")) { + x <- methods::as(x, "RsparseMatrix") + if (!inherits(x, "RsparseMatrix")) { + stop("Internal error: casting sparse matrix did not yield 'dgRMatrix'.") + } + } + + if (NROW(feature_weights)) { + if (is.list(feature_weights)) { + feature_weights <- unlist(feature_weights) + } + if (!inherits(feature_weights, c("numeric", "integer"))) { + stop("'feature_weights' must be a numeric vector or named list matching to columns of 'x'.") + } + if (NROW(names(feature_weights)) && NROW(colnames(x))) { + matched <- match(colnames(x), names(feature_weights)) + matched <- matched[!is.na(matched)] + matched <- matched[!duplicated(matched)] + if (length(matched) > 0 && length(matched) < length(feature_weights)) { + stop( + "'feature_weights' names do not contain all columns of 'x'. Missing: ", + utils::head(setdiff(colnames(x), names(feature_weights))) + ) + } + if (length(matched)) { + feature_weights <- feature_weights[matched] + } else { + warning("Names of 'feature_weights' do not match with 'x'. Names will be ignored.") + } + } + + lst_args$dmatrix_args$feature_weights <- unname(feature_weights) + } + + if (NROW(monotone_constraints)) { + + if (NROW(monotone_constraints) > ncol(x)) { + stop( + "'monotone_constraints' contains more entries than there are columns in 'x' (", + NROW(monotone_constraints), " vs. ", ncol(x), ")." + ) + } + + if (is.list(monotone_constraints)) { + + if (!NROW(names(monotone_constraints))) { + stop( + "If passing 'monotone_constraints' as a named list,", + " must have names matching to columns of 'x'." 
+ ) + } + if (!NROW(colnames(x))) { + stop("If passing 'monotone_constraints' as a named list, 'x' must have column names.") + } + if (anyDuplicated(names(monotone_constraints))) { + stop( + "'monotone_constraints' contains duplicated names: ", + paste( + names(monotone_constraints)[duplicated(names(monotone_constraints))] |> utils::head(), + collapse = ", " + ) + ) + } + if (NROW(setdiff(names(monotone_constraints), colnames(x)))) { + stop( + "'monotone_constraints' contains column names not present in 'x': ", + paste(utils::head(names(monotone_constraints)), collapse = ", ") + ) + } + + vec_monotone_constr <- rep(0, ncol(x)) + matched <- match(names(monotone_constraints), colnames(x)) + vec_monotone_constr[matched] <- unlist(monotone_constraints) + lst_args$params$monotone_constraints <- unname(vec_monotone_constr) + + } else if (inherits(monotone_constraints, c("numeric", "integer"))) { + + if (NROW(names(monotone_constraints)) && NROW(colnames(x))) { + if (length(monotone_constraints) < ncol(x)) { + return( + process.x.and.col.args( + x, + as.list(monotone_constraints), + interaction_constraints, + feature_weights, + lst_args, + use_qdm + ) + ) + } else { + matched <- match(names(monotone_constraints), colnames(x)) + matched <- matched[!is.na(matched)] + matched <- matched[!duplicated(matched)] + if (length(matched)) { + monotone_constraints <- monotone_constraints[matched] + } else { + warning("Names of 'monotone_constraints' do not match with 'x'. Names will be ignored.") + } + } + } else { + if (length(monotone_constraints) != ncol(x)) { + stop( + "If passing 'monotone_constraints' as unnamed vector or not using column names,", + " must have length matching to number of columns in 'x'. Got: ", + length(monotone_constraints), " (vs. ", ncol(x), ")" + ) + } + } + + lst_args$params$monotone_constraints <- unname(monotone_constraints) + + } else if (is.character(monotone_constraints)) { + lst_args$params$monotone_constraints <- monotone_constraints + } else { + stop( + "Passed unsupported type for 'monotone_constraints': ", + paste(class(monotone_constraints), collapse = ", ") + ) + } + } + + if (NROW(interaction_constraints)) { + if (!is.list(interaction_constraints)) { + stop("'interaction_constraints' must be a list of vectors.") + } + cnames <- colnames(x) + lst_args$params$interaction_constraints <- lapply(interaction_constraints, function(idx) { + if (!NROW(idx)) { + stop("Elements in 'interaction_constraints' cannot be empty.") + } + + if (is.character(idx)) { + if (!NROW(cnames)) { + stop( + "Passed a character vector for 'interaction_constraints', but 'x' ", + "has no column names to match them against." + ) + } + out <- match(idx, cnames) - 1L + if (anyNA(out)) { + stop( + "'interaction_constraints' contains column names not present in 'x': ", + paste(utils::head(idx[which(is.na(out))]), collapse = ", ") + ) + } + return(out) + } else if (inherits(idx, c("numeric", "integer"))) { + if (anyNA(idx)) { + stop("'interaction_constraints' cannot contain NA values.") + } + if (min(idx) < 1) { + stop("Column indices for 'interaction_constraints' must follow base-1 indexing.") + } + if (max(idx) > ncol(x)) { + stop("'interaction_constraints' contains invalid column indices.") + } + if (is.numeric(idx)) { + if (any(idx != floor(idx))) { + stop( + "'interaction_constraints' must contain only integer indices. 
Got non-integer: ", + paste(utils::head(idx[which(idx != floor(idx))]), collapse = ", ") + ) + } + } + return(idx - 1L) + } else { + stop( + "Elements in 'interaction_constraints' must be vectors of types ", + "'integer', 'numeric', or 'character'. Got: ", + paste(class(idx), collapse = ", ") + ) + } + }) + } + + lst_args$dmatrix_args$data <- x + return(lst_args) +} + +#' @noMd #' @export -xgboost <- function(data = NULL, label = NULL, missing = NA, weight = NULL, - params = list(), nrounds, - verbose = 1, print_every_n = 1L, - early_stopping_rounds = NULL, maximize = NULL, - save_period = NULL, save_name = "xgboost.model", - xgb_model = NULL, callbacks = list(), ...) { - merged <- check.booster.params(params, ...) - dtrain <- xgb.get.DMatrix( - data = data, - label = label, - missing = missing, - weight = weight, - nthread = merged$nthread +#' @title Fit XGBoost Model +#' @description Fits an XGBoost model (boosted decision tree ensemble) to given x/y data. +#' +#' See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{ +#' Introduction to Boosted Trees} for a longer explanation of what XGBoost does. +#' +#' This function is intended to provide a more user-friendly interface for XGBoost that follows +#' R's conventions for model fitting and predictions, but which doesn't expose all of the +#' possible functionalities of the core XGBoost library. +#' +#' See \link{xgb.train} for a more flexible low-level alternative which is similar across different +#' language bindings of XGBoost and which exposes the full library's functionalities. +#' @details For package authors using `xgboost` as a dependency, it is highly recommended to use +#' \link{xgb.train} in package code instead of `xgboost()`, since it has a more stable interface +#' and performs fewer data conversions and copies along the way. +#' @references \itemize{ +#' \item Chen, Tianqi, and Carlos Guestrin. "Xgboost: A scalable tree boosting system." +#' Proceedings of the 22nd acm sigkdd international conference on knowledge discovery and +#' data mining. 2016. +#' \item \url{https://xgboost.readthedocs.io/en/stable/} +#' } +#' @param x The features / covariates. Can be passed as:\itemize{ +#' \item A numeric or integer `matrix`. +#' \item A `data.frame`, in which all columns are one of the following types:\itemize{ +#' \item `numeric` +#' \item `integer` +#' \item `logical` +#' \item `factor` +#' } +#' +#' Columns of `factor` type will be assumed to be categorical, while other column types will +#' be assumed to be numeric. +#' \item A sparse matrix from the `Matrix` package, either as `dgCMatrix` or `dgRMatrix` class. +#' } +#' +#' Note that categorical features are only supported for `data.frame` inputs, and are automatically +#' determined based on their types. See \link{xgb.train} with \link{xgb.DMatrix} for more flexible +#' variants that would allow something like categorical features on sparse matrices. +#' @param y The response variable. Allowed values are:\itemize{ +#' \item A numeric or integer vector (for regression tasks). +#' \item A factor or character vector (for binary and multi-class classification tasks). +#' \item A logical (boolean) vector (for binary classification tasks). +#' \item A numeric or integer matrix or `data.frame` with numeric/integer columns +#' (for multi-task regression tasks). +#' \item A `Surv` object from the `survival` package (for survival tasks). 
+#' } +#' +#' If `objective` is `NULL`, the right task will be determined automatically based on +#' the class of `y`. +#' +#' If `objective` is not `NULL`, it must match the type of `y` - e.g. `factor` types of `y` +#' can only be used with classification objectives and vice versa. +#' +#' For binary classification, the last factor level of `y` will be used as the "positive" +#' class - that is, the numbers from `predict` will reflect the probabilities of belonging to this +#' class instead of to the first factor level. If `y` is a `logical` vector, then `TRUE` will be +#' set as the last level. +#' @param objective Optimization objective to minimize based on the supplied data, to be passed +#' by name as a string / character (e.g. `reg:absoluteerror`). See the +#' \href{https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters}{ +#' Learning Task Parameters} page for more detailed information on allowed values. +#' +#' If `NULL` (the default), will be automatically determined from `y` according to the following +#' logic:\itemize{ +#' \item If `y` is a factor with 2 levels, will use `binary:logistic`. +#' \item If `y` is a factor with more than 2 levels, will use `multi:softprob` (number of classes +#' will be determined automatically, should not be passed under `params`). +#' \item If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that +#' the only types supported are left / right / interval censored). +#' \item Otherwise, will use `reg:squarederror`. +#' } +#' +#' If `objective` is not `NULL`, it must match the type of `y` - e.g. `factor` types of `y` +#' can only be used with classification objectives and vice versa. +#' +#' Note that not all possible `objective` values supported by the core XGBoost library are allowed +#' here - for example, objectives which are a variation of another but with a different default +#' prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are +#' ranking objectives, nor custom objectives at the moment. +#' @param nrounds Number of boosting iterations / rounds. +#' +#' Note that the default number of boosting rounds is not automatically tuned, and different +#' problems will have vastly different optimal numbers of boosting rounds. +#' @param weights Sample weights for each row in `x` and `y`. If `NULL` (the default), each row +#' will have the same weight. +#' +#' If not `NULL`, should be passed as a numeric vector with length matching the number of +#' rows in `x`. +#' @param verbosity Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), +#' 2 (info), and 3 (debug). +#' @param nthreads Number of parallel threads to use. If passing zero, will use all CPU threads. +#' @param seed Seed to use for random number generation. If passing `NULL`, will draw a random +#' number using R's PRNG system to use as seed. +#' @param monotone_constraints Optional monotonicity constraints for features. +#' +#' Can be passed either as a named list (when `x` has column names), or as a vector. If passed +#' as a vector and `x` has column names, will try to match the elements by name. +#' +#' A value of `+1` for a given feature makes the model predictions / scores constrained to be +#' a monotonically increasing function of that feature (that is, as the value of the feature +#' increases, the model prediction cannot decrease), while a value of `-1` makes it a monotonically +#' decreasing function. A value of zero imposes no constraint. 
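A minimal usage sketch of the named-list form just described (the subset behavior it relies on is spelled out in the next paragraph). The dataset and the direction of the constraint are illustrative assumptions, not part of this patch:

library(xgboost)
data(mtcars)

# Named-list form: columns omitted from the list ('hp' here) default to zero,
# i.e. unconstrained; 'wt' is forced to act monotonically decreasing on the score.
model <- xgboost(
  mtcars[, c("wt", "hp")],               # 'x' as a data.frame with column names
  mtcars$mpg,                            # numeric 'y' -> objective defaults to "reg:squarederror"
  nrounds = 10,
  nthreads = 1,
  monotone_constraints = list(wt = -1)
)
predict(model, mtcars[, c("wt", "hp")], validate_features = TRUE)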
+#' +#' The input for `monotone_constraints` can be a subset of the columns of `x` if named, in which +#' case the columns that are not referred to in `monotone_constraints` will be assumed to have +#' a value of zero (no constraint imposed on the model for those features). +#' +#' See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/monotonic.html}{ +#' Monotonic Constraints} for a more detailed explanation. +#' @param interaction_constraints Constraints for interaction representing permitted interactions. +#' The constraints must be specified in the form of a list of vectors referencing columns in the +#' data, e.g. `list(c(1, 2), c(3, 4, 5))` (with these numbers being column indices, numeration +#' starting at 1 - i.e. the first sublist references the first and second columns) or +#' `list(c("Sepal.Length", "Sepal.Width"), c("Petal.Length", "Petal.Width"))` (references +#' columns by names), where each vector is a group of indices of features that are allowed to +#' interact with each other. +#' +#' See the tutorial +#' \href{https://xgboost.readthedocs.io/en/stable/tutorials/feature_interaction_constraint.html}{ +#' Feature Interaction Constraints} for more information. +#' @param feature_weights Feature weights for column sampling. +#' +#' Can be passed either as a vector with length matching to columns of `x`, or as a named +#' list (only if `x` has column names) with names matching to columns of 'x'. If it is a +#' named vector, will try to match the entries to column names of `x` by name. +#' +#' If `NULL` (the default), all columns will have the same weight. +#' @param base_margin Base margin used for boosting from existing model. +#' +#' If passing it, will start the gradient boosting procedure from the scores that are provided +#' here - for example, one can pass the raw scores from a previous model, or some per-observation +#' offset, or similar. +#' +#' Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives) +#' with the same number of rows as `x` and number of columns corresponding to number of optimization +#' targets, and should be in the untransformed scale (for example, for objective `binary:logistic`, +#' it should have log-odds, not probabilities; and for objective `multi:softprob`, should have +#' number of columns matching to number of classes in the data). +#' +#' Note that, if it contains more than one column, then columns will not be matched by name to +#' the corresponding `y` - `base_margin` should have the same column order that the model will use +#' (for example, for objective `multi:softprob`, columns of `base_margin` will be matched against +#' `levels(y)` by their position, regardless of what `colnames(base_margin)` returns). +#' +#' If `NULL`, will start from zero, but note that for most objectives, an intercept is usually +#' added (controllable through parameter `base_score` instead) when `base_margin` is not passed. +#' @param ... Other training parameters. See the online documentation +#' \href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for +#' details about possible values and what they do. +#' +#' Note that not all possible values from the core XGBoost library are allowed as `params` for +#' 'xgboost()' - in particular, values which require an already-fitted booster object (such as +#' `process_type`) are not accepted here. +#' @return A model object, inheriting from both `xgboost` and `xgb.Booster`. 
Compared to the regular +#' `xgb.Booster` model class produced by \link{xgb.train}, this `xgboost` class will have an +#' additional attribute `metadata` containing information which is used for formatting prediction +#' outputs, such as class names for classification problems. +#' @examples +#' library(xgboost) +#' data(mtcars) +#' +#' # Fit a small regression model on the mtcars data +#' model_regression <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3) +#' predict(model_regression, mtcars, validate_features = TRUE) +#' +#' # Task objective is determined automatically according to the type of 'y' +#' data(iris) +#' model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5) +#' predict(model_classif, iris, validate_features = TRUE) +xgboost <- function( + x, + y, + objective = NULL, + nrounds = 100L, + weights = NULL, + verbosity = 0L, + nthreads = parallel::detectCores(), + seed = 0L, + monotone_constraints = NULL, + interaction_constraints = NULL, + feature_weights = NULL, + base_margin = NULL, + ... +) { + # Note: '...' is a workaround, to be removed later by making all parameters be arguments + params <- list(...) + params <- prescreen.parameters(params) + prescreen.objective(objective) + use_qdm <- check.can.use.qdm(x, params) + lst_args <- process.y.margin.and.objective(y, base_margin, objective, params) + lst_args <- process.row.weights(weights, lst_args) + lst_args <- process.x.and.col.args( + x, + monotone_constraints, + interaction_constraints, + feature_weights, + lst_args, + use_qdm + ) + + if (use_qdm && "max_bin" %in% names(params)) { + lst_args$dmatrix_args$max_bin <- params$max_bin + } + + nthreads <- check.nthreads(nthreads) + lst_args$dmatrix_args$nthread <- nthreads + lst_args$params$nthread <- nthreads + lst_args$params$seed <- seed + + params <- c(lst_args$params, params) + + fn_dm <- if (use_qdm) xgb.QuantileDMatrix else xgb.DMatrix + dm <- do.call(fn_dm, lst_args$dmatrix_args) + model <- xgb.train( + params = params, + data = dm, + nrounds = nrounds, + verbose = verbosity ) + attributes(model)$metadata <- lst_args$metadata + attributes(model)$call <- match.call() + class(model) <- c("xgboost", class(model)) + return(model) +} - evals <- list(train = dtrain) +#' @export +print.xgboost <- function(x, ...) { + cat("XGBoost model object\n") + cat("Call:\n ") + print(attributes(x)$call) + cat("Objective: ", attributes(x)$params$objective, "\n", sep = "") + cat("Number of iterations: ", xgb.get.num.boosted.rounds(x), "\n", sep = "") + cat("Number of features: ", xgb.num_feature(x), "\n", sep = "") - bst <- xgb.train(params, dtrain, nrounds, evals, verbose = verbose, print_every_n = print_every_n, - early_stopping_rounds = early_stopping_rounds, maximize = maximize, - save_period = save_period, save_name = save_name, - xgb_model = xgb_model, callbacks = callbacks, ...) 
- return(bst) + printable_head <- function(v) { + v_sub <- utils::head(v, 5L) + return( + sprintf( + "%s%s", + paste(v_sub, collapse = ", "), + ifelse(length(v_sub) < length(v), ", ...", "") + ) + ) + } + + if (NROW(attributes(x)$metadata$y_levels)) { + cat( + "Classes: ", + printable_head(attributes(x)$metadata$y_levels), + "\n", + sep = "" + ) + } else if (NROW(attributes(x)$params$quantile_alpha)) { + cat( + "Prediction quantile", + ifelse(length(attributes(x)$params$quantile_alpha) > 1L, "s", ""), + ": ", + printable_head(attributes(x)$params$quantile_alpha), + "\n", + sep = "" + ) + } else if (NROW(attributes(x)$metadata$y_names)) { + cat( + "Prediction targets: ", + printable_head(attributes(x)$metadata$y_names), + "\n", + sep = "" + ) + } else if (attributes(x)$metadata$n_targets > 1L) { + cat( + "Number of prediction targets: ", + attributes(x)$metadata$n_targets, + "\n", + sep = "" + ) + } + + invisible(x) } + #' Training part from Mushroom Data Set #' #' This data set is originally from the Mushroom data set, diff --git a/R-package/demo/basic_walkthrough.R b/R-package/demo/basic_walkthrough.R index 9403bac2064c..c65790109fc2 100644 --- a/R-package/demo/basic_walkthrough.R +++ b/R-package/demo/basic_walkthrough.R @@ -16,29 +16,28 @@ class(train$data) # note: we are putting in sparse matrix here, xgboost naturally handles sparse input # use sparse matrix when your feature is sparse (e.g. when you are using one-hot encoding vector) print("Training xgboost with sparseMatrix") -bst <- xgboost(data = train$data, label = train$label, max_depth = 2, eta = 1, nrounds = 2, - nthread = 2, objective = "binary:logistic") +bst <- xgboost(x = train$data, y = factor(train$label, c(0, 1)), + max_depth = 2, eta = 1, + nrounds = 2, nthreads = 2) # alternatively, you can put in dense matrix, i.e. 
basic R-matrix print("Training xgboost with Matrix") -bst <- xgboost(data = as.matrix(train$data), label = train$label, max_depth = 2, eta = 1, nrounds = 2, - nthread = 2, objective = "binary:logistic") +bst <- xgboost(x = as.matrix(train$data), y = factor(train$label, c(0, 1)), + max_depth = 2, eta = 1, + nrounds = 2, nthreads = 2) # you can also put in xgb.DMatrix object, which stores label, data and other metadata needed for advanced features print("Training xgboost with xgb.DMatrix") dtrain <- xgb.DMatrix(data = train$data, label = train$label) -bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, nthread = 2, - objective = "binary:logistic") +params <- list(max_depth = 2, eta = 1, nthread = 2, objective = "binary:logistic") +bst <- xgb.train(data = dtrain, params = params, nrounds = 2) # Verbose = 0,1,2 print("Train xgboost with verbose 0, no message") -bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, - nthread = 2, objective = "binary:logistic", verbose = 0) +bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 0) print("Train xgboost with verbose 1, print evaluation metric") -bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, - nthread = 2, objective = "binary:logistic", verbose = 1) +bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 1) print("Train xgboost with verbose 2, also print information about tree") -bst <- xgboost(data = dtrain, max_depth = 2, eta = 1, nrounds = 2, - nthread = 2, objective = "binary:logistic", verbose = 2) +bst <- xgb.train(data = dtrain, params = params, nrounds = 2, verbose = 2) # you can also specify data as file path to a LIBSVM format input # since we do not have this file with us, the following line is just for illustration diff --git a/R-package/man/print.xgb.Booster.Rd b/R-package/man/print.xgb.Booster.Rd index 9a783efaff27..fc055318cd01 100644 --- a/R-package/man/print.xgb.Booster.Rd +++ b/R-package/man/print.xgb.Booster.Rd @@ -21,9 +21,8 @@ Print information about \code{xgb.Booster}. data(agaricus.train, package = "xgboost") train <- agaricus.train -bst <- xgboost( - data = train$data, - label = train$label, +bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = 2, diff --git a/R-package/man/xgb.attr.Rd b/R-package/man/xgb.attr.Rd index 8038a2048b70..f23e9234018a 100644 --- a/R-package/man/xgb.attr.Rd +++ b/R-package/man/xgb.attr.Rd @@ -64,9 +64,8 @@ example of these behaviors). 
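The change applied throughout these demos and help files is mechanical. A sketch of the equivalence, with the removed call kept as a comment for reference:

library(xgboost)
data(agaricus.train, package = "xgboost")

# Old interface (removed by this patch):
# bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
#                max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
#                objective = "binary:logistic")

# New equivalent used throughout the updated examples:
bst <- xgb.train(
  data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
  max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
  objective = "binary:logistic"
)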
data(agaricus.train, package = "xgboost") train <- agaricus.train -bst <- xgboost( - data = train$data, - label = train$label, +bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = 2, diff --git a/R-package/man/xgb.config.Rd b/R-package/man/xgb.config.Rd index 1ab810644db9..dbad1d8cf043 100644 --- a/R-package/man/xgb.config.Rd +++ b/R-package/man/xgb.config.Rd @@ -35,9 +35,8 @@ nthread <- 1 data.table::setDTthreads(nthread) train <- agaricus.train -bst <- xgboost( - data = train$data, - label = train$label, +bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = nthread, diff --git a/R-package/man/xgb.dump.Rd b/R-package/man/xgb.dump.Rd index 6f97f69244b9..199ede1583f8 100644 --- a/R-package/man/xgb.dump.Rd +++ b/R-package/man/xgb.dump.Rd @@ -49,8 +49,8 @@ data(agaricus.train, package='xgboost') data(agaricus.test, package='xgboost') train <- agaricus.train test <- agaricus.test -bst <- xgboost(data = train$data, label = train$label, max_depth = 2, - eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") +bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, + eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic") # save the model in file 'xgb.model.dump' dump_path = file.path(tempdir(), 'model.dump') xgb.dump(bst, dump_path, with_stats = TRUE) diff --git a/R-package/man/xgb.importance.Rd b/R-package/man/xgb.importance.Rd index 73b91e8b4b28..76574b9cbf06 100644 --- a/R-package/man/xgb.importance.Rd +++ b/R-package/man/xgb.importance.Rd @@ -70,9 +70,8 @@ be on the same scale (which is also recommended when using L1 or L2 regularizati # binomial classification using "gbtree": data(agaricus.train, package = "xgboost") -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 2, eta = 1, nthread = 2, @@ -83,9 +82,8 @@ bst <- xgboost( xgb.importance(model = bst) # binomial classification using "gblinear": -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), booster = "gblinear", eta = 0.3, nthread = 1, @@ -97,9 +95,11 @@ xgb.importance(model = bst) # multiclass classification using "gbtree": nclass <- 3 nrounds <- 10 -mbst <- xgboost( - data = as.matrix(iris[, -5]), - label = as.numeric(iris$Species) - 1, +mbst <- xgb.train( + data = xgb.DMatrix( + as.matrix(iris[, -5]), + label = as.numeric(iris$Species) - 1 + ), max_depth = 3, eta = 0.2, nthread = 2, @@ -123,9 +123,11 @@ xgb.importance( ) # multiclass classification using "gblinear": -mbst <- xgboost( - data = scale(as.matrix(iris[, -5])), - label = as.numeric(iris$Species) - 1, +mbst <- xgb.train( + data = xgb.DMatrix( + scale(as.matrix(iris[, -5])), + label = as.numeric(iris$Species) - 1 + ), booster = "gblinear", eta = 0.2, nthread = 1, diff --git a/R-package/man/xgb.model.dt.tree.Rd b/R-package/man/xgb.model.dt.tree.Rd index 75f1cd0f4f77..e9536767986c 100644 --- a/R-package/man/xgb.model.dt.tree.Rd +++ b/R-package/man/xgb.model.dt.tree.Rd @@ -63,9 +63,8 @@ data(agaricus.train, package = "xgboost") nthread <- 1 data.table::setDTthreads(nthread) -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 2, eta = 1, nthread = 
nthread, diff --git a/R-package/man/xgb.parameters.Rd b/R-package/man/xgb.parameters.Rd index 8d5044cab5cc..82977dc122d4 100644 --- a/R-package/man/xgb.parameters.Rd +++ b/R-package/man/xgb.parameters.Rd @@ -33,9 +33,8 @@ will reset its number of rounds indicator to zero. data(agaricus.train, package = "xgboost") train <- agaricus.train -bst <- xgboost( - data = train$data, - label = train$label, +bst <- xgb.train( + data = xgb.DMatrix(train$data, label = train$label), max_depth = 2, eta = 1, nthread = 2, diff --git a/R-package/man/xgb.plot.deepness.Rd b/R-package/man/xgb.plot.deepness.Rd index 43c0dac777f6..3da8e384e4a1 100644 --- a/R-package/man/xgb.plot.deepness.Rd +++ b/R-package/man/xgb.plot.deepness.Rd @@ -73,9 +73,8 @@ nthread <- 2 data.table::setDTthreads(nthread) ## Change max_depth to a higher number to get a more significant result -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 6, nthread = nthread, nrounds = 50, diff --git a/R-package/man/xgb.plot.importance.Rd b/R-package/man/xgb.plot.importance.Rd index e9c5930c2d57..a9ebcbd2732a 100644 --- a/R-package/man/xgb.plot.importance.Rd +++ b/R-package/man/xgb.plot.importance.Rd @@ -88,9 +88,8 @@ data(agaricus.train) nthread <- 2 data.table::setDTthreads(nthread) -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 3, eta = 1, nthread = nthread, diff --git a/R-package/man/xgb.plot.multi.trees.Rd b/R-package/man/xgb.plot.multi.trees.Rd index 7fa75c85d886..eae84d98edfd 100644 --- a/R-package/man/xgb.plot.multi.trees.Rd +++ b/R-package/man/xgb.plot.multi.trees.Rd @@ -67,9 +67,8 @@ data(agaricus.train, package = "xgboost") nthread <- 2 data.table::setDTthreads(nthread) -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label), max_depth = 15, eta = 1, nthread = nthread, diff --git a/R-package/man/xgb.plot.shap.Rd b/R-package/man/xgb.plot.shap.Rd index b460fa1fb3a6..f2d2ea2a05e6 100644 --- a/R-package/man/xgb.plot.shap.Rd +++ b/R-package/man/xgb.plot.shap.Rd @@ -135,9 +135,8 @@ nthread <- 1 data.table::setDTthreads(nthread) nrounds <- 20 -bst <- xgboost( - agaricus.train$data, - agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), nrounds = nrounds, eta = 0.1, max_depth = 3, @@ -161,9 +160,8 @@ x <- as.matrix(iris[, -5]) set.seed(123) is.na(x[sample(nrow(x) * 4, 30)]) <- TRUE # introduce some missing values -mbst <- xgboost( - data = x, - label = as.numeric(iris$Species) - 1, +mbst <- xgb.train( + data = xgb.DMatrix(x, label = as.numeric(iris$Species) - 1), nrounds = nrounds, max_depth = 2, eta = 0.3, diff --git a/R-package/man/xgb.plot.tree.Rd b/R-package/man/xgb.plot.tree.Rd index 69d37301dde6..6064107fc184 100644 --- a/R-package/man/xgb.plot.tree.Rd +++ b/R-package/man/xgb.plot.tree.Rd @@ -96,9 +96,8 @@ This function uses \href{https://www.graphviz.org/}{GraphViz} as DiagrammeR back \examples{ data(agaricus.train, package = "xgboost") -bst <- xgboost( - data = agaricus.train$data, - label = agaricus.train$label, +bst <- xgb.train( + data = xgb.DMatrix(agaricus.train$data, agaricus.train$label), max_depth = 3, eta = 1, nthread = 2, diff --git a/R-package/man/xgb.train.Rd b/R-package/man/xgb.train.Rd index 
f641b1374420..fc970e4fb493 100644 --- a/R-package/man/xgb.train.Rd +++ b/R-package/man/xgb.train.Rd @@ -1,8 +1,7 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/xgb.train.R, R/xgboost.R +% Please edit documentation in R/xgb.train.R \name{xgb.train} \alias{xgb.train} -\alias{xgboost} \title{eXtreme Gradient Boosting Training} \usage{ xgb.train( @@ -22,24 +21,6 @@ xgb.train( callbacks = list(), ... ) - -xgboost( - data = NULL, - label = NULL, - missing = NA, - weight = NULL, - params = list(), - nrounds, - verbose = 1, - print_every_n = 1L, - early_stopping_rounds = NULL, - maximize = NULL, - save_period = NULL, - save_name = "xgboost.model", - xgb_model = NULL, - callbacks = list(), - ... -) } \arguments{ \item{params}{the list of parameters. The complete list of parameters is @@ -240,15 +221,6 @@ to customize the training process. }\if{html}{\out{}}} \item{...}{other parameters to pass to \code{params}.} - -\item{label}{vector of response values. Should not be provided when data is -a local data file name or an \code{xgb.DMatrix}.} - -\item{missing}{by default is set to NA, which means that NA values should be considered as 'missing' -by the algorithm. Sometimes, 0 or other extreme value might be used to represent missing values. -This parameter is only used when input is a dense matrix.} - -\item{weight}{a vector indicating the weight for each row of the input.} } \value{ An object of class \code{xgb.Booster}. @@ -383,9 +355,8 @@ bst <- xgb.train(param, dtrain, nrounds = 25, evals = evals, early_stopping_rounds = 3) ## An 'xgboost' interface example: -bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, - max_depth = 2, eta = 1, nthread = nthread, nrounds = 2, - objective = "binary:logistic") +bst <- xgboost(x = agaricus.train$data, y = factor(agaricus.train$label), + max_depth = 2, eta = 1, nthreads = nthread, nrounds = 2) pred <- predict(bst, agaricus.test$data) } diff --git a/R-package/man/xgboost.Rd b/R-package/man/xgboost.Rd new file mode 100644 index 000000000000..4af8f25ecc04 --- /dev/null +++ b/R-package/man/xgboost.Rd @@ -0,0 +1,213 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/xgboost.R +\name{xgboost} +\alias{xgboost} +\title{Fit XGBoost Model} +\usage{ +xgboost( + x, + y, + objective = NULL, + nrounds = 100L, + weights = NULL, + verbosity = 0L, + nthreads = parallel::detectCores(), + seed = 0L, + monotone_constraints = NULL, + interaction_constraints = NULL, + feature_weights = NULL, + base_margin = NULL, + ... +) +} +\arguments{ +\item{x}{The features / covariates. Can be passed as:\itemize{ +\item A numeric or integer `matrix`. +\item A `data.frame`, in which all columns are one of the following types:\itemize{ + \item `numeric` + \item `integer` + \item `logical` + \item `factor` +} + +Columns of `factor` type will be assumed to be categorical, while other column types will +be assumed to be numeric. +\item A sparse matrix from the `Matrix` package, either as `dgCMatrix` or `dgRMatrix` class. +} + +Note that categorical features are only supported for `data.frame` inputs, and are automatically +determined based on their types. See \link{xgb.train} with \link{xgb.DMatrix} for more flexible +variants that would allow something like categorical features on sparse matrices.} + +\item{y}{The response variable. Allowed values are:\itemize{ +\item A numeric or integer vector (for regression tasks). 
+\item A factor or character vector (for binary and multi-class classification tasks). +\item A logical (boolean) vector (for binary classification tasks). +\item A numeric or integer matrix or `data.frame` with numeric/integer columns +(for multi-task regression tasks). +\item A `Surv` object from the `survival` package (for survival tasks). +} + +If `objective` is `NULL`, the appropriate task will be determined automatically based on +the class of `y`. + +If `objective` is not `NULL`, it must match the type of `y` - e.g. `factor` types of `y` +can only be used with classification objectives and vice versa. + +For binary classification, the last factor level of `y` will be used as the "positive" +class - that is, the numbers from `predict` will reflect the probabilities of belonging to this +class instead of to the first factor level. If `y` is a `logical` vector, then `TRUE` will be +set as the last level.} + +\item{objective}{Optimization objective to minimize based on the supplied data, to be passed +by name as a string / character (e.g. `reg:absoluteerror`). See the +\href{https://xgboost.readthedocs.io/en/stable/parameter.html#learning-task-parameters}{ +Learning Task Parameters} page for more detailed information on allowed values. + +If `NULL` (the default), will be automatically determined from `y` according to the following +logic:\itemize{ +\item If `y` is a factor with 2 levels, will use `binary:logistic`. +\item If `y` is a factor with more than 2 levels, will use `multi:softprob` (the number of classes +will be determined automatically and should not be passed under `params`). +\item If `y` is a `Surv` object from the `survival` package, will use `survival:aft` (note that +the only censoring types supported are left / right / interval). +\item Otherwise, will use `reg:squarederror`. +} + +If `objective` is not `NULL`, it must match the type of `y` - e.g. `factor` types of `y` +can only be used with classification objectives and vice versa. + +Note that not all possible `objective` values supported by the core XGBoost library are allowed +here - for example, objectives which are a variation of another but with a different default +prediction type (e.g. `multi:softmax` vs. `multi:softprob`) are not allowed, and neither are +ranking objectives, nor custom objectives at the moment.} + +\item{nrounds}{Number of boosting iterations / rounds. + +Note that the default number of boosting rounds is not automatically tuned, and different +problems will have vastly different optimal numbers of boosting rounds.} + +\item{weights}{Sample weights for each row in `x` and `y`. If `NULL` (the default), each row +will have the same weight. + +If not `NULL`, should be passed as a numeric vector with length matching the number of +rows in `x`.} + +\item{verbosity}{Verbosity of printing messages. Valid values are 0 (silent), 1 (warning), +2 (info), and 3 (debug).} + +\item{nthreads}{Number of parallel threads to use. If passing zero, will use all CPU threads.} + +\item{seed}{Seed to use for random number generation. If passing `NULL`, will draw a random +number using R's PRNG system to use as the seed.} + +\item{monotone_constraints}{Optional monotonicity constraints for features. + +Can be passed either as a named list (when `x` has column names), or as a vector. If passed +as a vector and `x` has column names, will try to match the elements by name. 
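As a concrete illustration of the two accepted shapes, here is a minimal sketch using the `iris` columns that appear throughout this PR's tests (a hypothetical usage based on the argument description here; the meaning of the `+1`/`-1` codes is explained next):

```r
library(xgboost)
data(iris)

x <- iris[, c("Sepal.Width", "Petal.Length", "Petal.Width")]
y <- iris$Sepal.Length

# Named list covering a subset of columns; columns not mentioned default to zero.
m1 <- xgboost(x, y, nthreads = 1, nrounds = 3,
              monotone_constraints = list(Petal.Length = 1))

# The same constraint as a plain vector aligned with the columns of 'x'.
m2 <- xgboost(x, y, nthreads = 1, nrounds = 3,
              monotone_constraints = c(0, 1, 0))
```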
+ +A value of `+1` for a given feature constrains the model predictions / scores to be +a monotonically increasing function of that feature (that is, as the value of the feature +increases, the model prediction cannot decrease), while a value of `-1` makes it a monotonically +decreasing function. A value of zero imposes no constraint. + +The input for `monotone_constraints` can be a subset of the columns of `x` if named, in which +case the columns that are not referred to in `monotone_constraints` will be assumed to have +a value of zero (no constraint imposed on the model for those features). + +See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/monotonic.html}{ +Monotonic Constraints} for a more detailed explanation.} + +\item{interaction_constraints}{Constraints specifying the permitted feature interactions. +The constraints must be specified in the form of a list of vectors referencing columns in the +data, e.g. `list(c(1, 2), c(3, 4, 5))` (with these numbers being column indices, numbering +starting at 1 - i.e. the first sublist references the first and second columns) or +`list(c("Sepal.Length", "Sepal.Width"), c("Petal.Length", "Petal.Width"))` (referencing +columns by name), where each vector is a group of indices of features that are allowed to +interact with each other. + +See the tutorial +\href{https://xgboost.readthedocs.io/en/stable/tutorials/feature_interaction_constraint.html}{ +Feature Interaction Constraints} for more information.} + +\item{feature_weights}{Feature weights for column sampling. + +Can be passed either as a vector with length matching the number of columns of `x`, or as a +named list (only if `x` has column names) with names matching the columns of `x`. If passed as +a named vector, will try to match the entries to the column names of `x` by name. + +If `NULL` (the default), all columns will have the same weight.} + +\item{base_margin}{Base margin used for boosting from an existing model. + +If passed, the gradient boosting procedure will start from the scores that are provided +here - for example, one can pass the raw scores from a previous model, or some per-observation +offset, or similar. + +Should be either a numeric vector or numeric matrix (for multi-class and multi-target objectives) +with the same number of rows as `x` and a number of columns corresponding to the number of +optimization targets, and should be in the untransformed scale (for example, for objective +`binary:logistic`, it should have log-odds, not probabilities; and for objective `multi:softprob`, +should have a number of columns matching the number of classes in the data). + +Note that, if it contains more than one column, the columns will not be matched by name to +the corresponding `y` - `base_margin` should have the same column order that the model will use +(for example, for objective `multi:softprob`, columns of `base_margin` will be matched against +`levels(y)` by their position, regardless of what `colnames(base_margin)` returns). + +If `NULL`, will start from zero, but note that for most objectives, an intercept is usually +added (controllable through parameter `base_score` instead) when `base_margin` is not passed.} + +\item{...}{Other training parameters. See the online documentation +\href{https://xgboost.readthedocs.io/en/stable/parameter.html}{XGBoost Parameters} for +details about possible values and what they do. 
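Before the caveats that follow, a short sketch tying together the column-name-based arguments described above. This is hedged: the column names come from `iris`, and `colsample_bynode` is an ordinary booster parameter assumed to pass through `...` (feature weights only matter when column sampling is active):

```r
library(xgboost)
data(iris)

model <- xgboost(
  iris[, 1:4], iris$Species,
  nthreads = 1, nrounds = 5,
  # Groups of columns that may interact, referenced by name.
  interaction_constraints = list(
    c("Sepal.Length", "Sepal.Width"),
    c("Petal.Length", "Petal.Width")
  ),
  # Column-sampling weights, matched to the columns of 'x' by name.
  feature_weights = c(Sepal.Length = 1, Sepal.Width = 1,
                      Petal.Length = 2, Petal.Width = 2),
  colsample_bynode = 0.5
)
```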
+ +Note that not all possible values from the core XGBoost library are allowed as `params` for +'xgboost()' - in particular, values which require an already-fitted booster object (such as +`process_type`) are not accepted here.} +} +\value{ +A model object, inheriting from both `xgboost` and `xgb.Booster`. Compared to the regular +`xgb.Booster` model class produced by \link{xgb.train}, this `xgboost` class will have an +additional attribute `metadata` containing information which is used for formatting prediction +outputs, such as class names for classification problems. +} +\description{ +Fits an XGBoost model (boosted decision tree ensemble) to given x/y data. + +See the tutorial \href{https://xgboost.readthedocs.io/en/stable/tutorials/model.html}{ +Introduction to Boosted Trees} for a longer explanation of what XGBoost does. + +This function is intended to provide a more user-friendly interface for XGBoost that follows +R's conventions for model fitting and predictions, but which doesn't expose all of the +possible functionalities of the core XGBoost library. + +See \link{xgb.train} for a more flexible low-level alternative which is similar across different +language bindings of XGBoost and which exposes the full library's functionalities. +} +\details{ +For package authors using `xgboost` as a dependency, it is highly recommended to use +\link{xgb.train} in package code instead of `xgboost()`, since it has a more stable interface +and performs fewer data conversions and copies along the way. +} +\examples{ +library(xgboost) +data(mtcars) + +# Fit a small regression model on the mtcars data +model_regression <- xgboost(mtcars[, -1], mtcars$mpg, nthreads = 1, nrounds = 3) +predict(model_regression, mtcars, validate_features = TRUE) + +# Task objective is determined automatically according to the type of 'y' +data(iris) +model_classif <- xgboost(iris[, -5], iris$Species, nthreads = 1, nrounds = 5) +predict(model_classif, iris, validate_features = TRUE) +} +\references{ +\itemize{ +\item Chen, Tianqi, and Carlos Guestrin. "Xgboost: A scalable tree boosting system." +Proceedings of the 22nd acm sigkdd international conference on knowledge discovery and +data mining. 2016. 
+\item \url{https://xgboost.readthedocs.io/en/stable/} +} +} diff --git a/R-package/tests/testthat/test_xgboost.R b/R-package/tests/testthat/test_xgboost.R new file mode 100644 index 000000000000..a4ac658a11b8 --- /dev/null +++ b/R-package/tests/testthat/test_xgboost.R @@ -0,0 +1,623 @@ +library(survival) +library(data.table) + +test_that("Auto determine objective", { + y_num <- seq(1, 10) + res_num <- process.y.margin.and.objective(y_num, NULL, NULL, NULL) + expect_equal(res_num$params$objective, "reg:squarederror") + + y_bin <- factor(c('a', 'b', 'a', 'b'), c('a', 'b')) + res_bin <- process.y.margin.and.objective(y_bin, NULL, NULL, NULL) + expect_equal(res_bin$params$objective, "binary:logistic") + + y_multi <- factor(c('a', 'b', 'a', 'b', 'c'), c('a', 'b', 'c')) + res_multi <- process.y.margin.and.objective(y_multi, NULL, NULL, NULL) + expect_equal(res_multi$params$objective, "multi:softprob") + + y_surv <- Surv(1:10, rep(c(0, 1), 5), type = "right") + res_surv <- process.y.margin.and.objective(y_surv, NULL, NULL, NULL) + expect_equal(res_surv$params$objective, "survival:aft") + + y_multicol <- matrix(seq(1, 20), nrow = 5) + res_multicol <- process.y.margin.and.objective(y_multicol, NULL, NULL, NULL) + expect_equal(res_multicol$params$objective, "reg:squarederror") +}) + +test_that("Process vectors", { + y <- seq(1, 10) + for (y_inp in list(as.integer(y), as.numeric(y))) { + res <- process.y.margin.and.objective(y_inp, NULL, "reg:pseudohubererror", NULL) + expect_equal( + res$dmatrix_args$label, + y + ) + expect_equal( + res$params$objective, + "reg:pseudohubererror" + ) + } +}) + +test_that("Process factors", { + y_bin <- factor(c('a', 'b', 'a', 'b'), c('a', 'b')) + expect_error({ + process.y.margin.and.objective(y_bin, NULL, "multi:softprob", NULL) + }) + for (bin_obj in c("binary:logistic", "binary:hinge")) { + for (y_inp in list(y_bin, as.ordered(y_bin))) { + res_bin <- process.y.margin.and.objective(y_inp, NULL, bin_obj, NULL) + expect_equal( + res_bin$dmatrix_args$label, + c(0, 1, 0, 1) + ) + expect_equal( + res_bin$metadata$y_levels, + c('a', 'b') + ) + expect_equal( + res_bin$params$objective, + bin_obj + ) + } + } + + y_bin2 <- factor(c(1, 0, 1, 0), c(1, 0)) + res_bin <- process.y.margin.and.objective(y_bin2, NULL, "binary:logistic", NULL) + expect_equal( + res_bin$dmatrix_args$label, + c(0, 1, 0, 1) + ) + expect_equal( + res_bin$metadata$y_levels, + c("1", "0") + ) + + y_bin3 <- c(TRUE, FALSE, TRUE) + res_bin <- process.y.margin.and.objective(y_bin3, NULL, "binary:logistic", NULL) + expect_equal( + res_bin$dmatrix_args$label, + c(1, 0, 1) + ) + expect_equal( + res_bin$metadata$y_levels, + c("FALSE", "TRUE") + ) + + y_multi <- factor(c('a', 'b', 'c', 'd', 'a', 'b'), c('a', 'b', 'c', 'd')) + expect_error({ + process.y.margin.and.objective(y_multi, NULL, "binary:logistic", NULL) + }) + expect_error({ + process.y.margin.and.objective(y_multi, NULL, "binary:logistic", NULL) + }) + res_multi <- process.y.margin.and.objective(y_multi, NULL, "multi:softprob", NULL) + expect_equal( + res_multi$dmatrix_args$label, + c(0, 1, 2, 3, 0, 1) + ) + expect_equal( + res_multi$metadata$y_levels, + c('a', 'b', 'c', 'd') + ) + expect_equal( + res_multi$params$num_class, + 4 + ) + expect_equal( + res_multi$params$objective, + "multi:softprob" + ) +}) + +test_that("Process survival objects", { + data(cancer, package = "survival") + y_right <- Surv(cancer$time, cancer$status - 1, type = "right") + res_cox <- process.y.margin.and.objective(y_right, NULL, "survival:cox", NULL) + expect_equal( + 
res_cox$dmatrix_args$label, + ifelse(cancer$status == 2, cancer$time, -cancer$time) + ) + expect_equal( + res_cox$params$objective, + "survival:cox" + ) + + res_aft <- process.y.margin.and.objective(y_right, NULL, "survival:aft", NULL) + expect_equal( + res_aft$dmatrix_args$label_lower_bound, + cancer$time + ) + expect_equal( + res_aft$dmatrix_args$label_upper_bound, + ifelse(cancer$status == 2, cancer$time, Inf) + ) + expect_equal( + res_aft$params$objective, + "survival:aft" + ) + + y_left <- Surv(seq(1, 4), c(1, 0, 1, 0), type = "left") + expect_error({ + process.y.margin.and.objective(y_left, NULL, "survival:cox", NULL) + }) + res_aft <- process.y.margin.and.objective(y_left, NULL, "survival:aft", NULL) + expect_equal( + res_aft$dmatrix_args$label_lower_bound, + c(1, 0, 3, 0) + ) + expect_equal( + res_aft$dmatrix_args$label_upper_bound, + seq(1, 4) + ) + expect_equal( + res_aft$params$objective, + "survival:aft" + ) + + y_interval <- Surv( + time = c(1, 5, 2, 10, 3), + time2 = c(2, 5, 2.5, 10, 3), + event = c(3, 1, 3, 0, 2), + type = "interval" + ) + expect_error({ + process.y.margin.and.objective(y_interval, NULL, "survival:cox", NULL) + }) + res_aft <- process.y.margin.and.objective(y_interval, NULL, "survival:aft", NULL) + expect_equal( + res_aft$dmatrix_args$label_lower_bound, + c(1, 5, 2, 10, 0) + ) + expect_equal( + res_aft$dmatrix_args$label_upper_bound, + c(2, 5, 2.5, Inf, 3) + ) + expect_equal( + res_aft$params$objective, + "survival:aft" + ) + + y_interval_neg <- Surv( + time = c(1, -5, 2, 10, 3), + time2 = c(2, -5, 2.5, 10, 3), + event = c(3, 1, 3, 0, 2), + type = "interval" + ) + expect_error({ + process.y.margin.and.objective(y_interval_neg, NULL, "survival:aft", NULL) + }) +}) + +test_that("Process multi-target", { + data(mtcars) + y_multi <- data.frame( + y1 = mtcars$mpg, + y2 = mtcars$mpg ^ 2 + ) + for (y_inp in list(y_multi, as.matrix(y_multi), data.table::as.data.table(y_multi))) { + res_multi <- process.y.margin.and.objective(y_inp, NULL, "reg:pseudohubererror", NULL) + expect_equal( + res_multi$dmatrix_args$label, + as.matrix(y_multi) + ) + expect_equal( + res_multi$metadata$y_names, + c("y1", "y2") + ) + expect_equal( + res_multi$params$objective, + "reg:pseudohubererror" + ) + } + + expect_error({ + process.y.margin.and.objective(y_multi, NULL, "count:poisson", NULL) + }) + + y_bad <- data.frame( + c1 = seq(1, 3), + c2 = rep(as.Date("2024-01-01"), 3) + ) + expect_error({ + process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL) + }) + + y_bad <- data.frame( + c1 = seq(1, 3), + c2 = factor(c('a', 'b', 'a'), c('a', 'b')) + ) + expect_error({ + process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL) + }) + + y_bad <- seq(1, 20) + dim(y_bad) <- c(5, 2, 2) + expect_error({ + process.y.margin.and.objective(y_bad, NULL, "reg:squarederror", NULL) + }) +}) + +test_that("Process base_margin", { + y <- seq(101, 110) + bm_good <- seq(1, 10) + for (bm in list(bm_good, as.matrix(bm_good), as.data.frame(as.matrix(bm_good)))) { + res <- process.y.margin.and.objective(y, bm, "reg:squarederror", NULL) + expect_equal( + res$dmatrix_args$base_margin, + seq(1, 10) + ) + } + expect_error({ + process.y.margin.and.objective(y, 5, "reg:squarederror", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, seq(1, 5), "reg:squarederror", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, matrix(seq(1, 20), ncol = 2), "reg:squarederror", NULL) + }) + expect_error({ + process.y.margin.and.objective( + y, + as.data.frame(matrix(seq(1, 20), ncol 
= 2)), + "reg:squarederror", + NULL + ) + }) + + y <- factor(c('a', 'b', 'c', 'a')) + bm_good <- matrix(seq(1, 12), ncol = 3) + for (bm in list(bm_good, as.data.frame(bm_good))) { + res <- process.y.margin.and.objective(y, bm, "multi:softprob", NULL) + expect_equal( + res$dmatrix_args$base_margin |> unname(), + matrix(seq(1, 12), ncol = 3) + ) + } + expect_error({ + process.y.margin.and.objective(y, as.numeric(bm_good), "multi:softprob", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, 5, "multi:softprob", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, 1], "multi:softprob", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, c(1, 2)], "multi:softprob", NULL) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[c(1, 2), ], "multi:softprob", NULL) + }) + + y <- seq(101, 110) + bm_good <- matrix(seq(1, 30), ncol = 3) + params <- list(quantile_alpha = c(0.1, 0.5, 0.9)) + for (bm in list(bm_good, as.data.frame(bm_good))) { + res <- process.y.margin.and.objective(y, bm, "reg:quantileerror", params) + expect_equal( + res$dmatrix_args$base_margin |> unname(), + matrix(seq(1, 30), ncol = 3) + ) + } + expect_error({ + process.y.margin.and.objective(y, as.numeric(bm_good), "reg:quantileerror", params) + }) + expect_error({ + process.y.margin.and.objective(y, 5, "reg:quantileerror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, 1], "reg:quantileerror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, c(1, 2)], "reg:quantileerror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[c(1, 2, 3), ], "reg:quantileerror", params) + }) + + y <- matrix(seq(101, 130), ncol = 3) + for (bm in list(bm_good, as.data.frame(bm_good))) { + res <- process.y.margin.and.objective(y, bm, "reg:squarederror", params) + expect_equal( + res$dmatrix_args$base_margin |> unname(), + matrix(seq(1, 30), ncol = 3) + ) + } + expect_error({ + process.y.margin.and.objective(y, as.numeric(bm_good), "reg:squarederror", params) + }) + expect_error({ + process.y.margin.and.objective(y, 5, "reg:squarederror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, 1], "reg:squarederror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[, c(1, 2)], "reg:squarederror", params) + }) + expect_error({ + process.y.margin.and.objective(y, bm_good[c(1, 2, 3), ], "reg:squarederror", params) + }) +}) + +test_that("Process monotone constraints", { + data(iris) + mc_list <- list(Sepal.Width = 1) + res <- process.x.and.col.args( + iris, + monotone_constraints = mc_list, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + c(0, 1, 0, 0, 0) + ) + + mc_list2 <- list(Sepal.Width = 1, Petal.Width = -1) + res <- process.x.and.col.args( + iris, + monotone_constraints = mc_list2, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + c(0, 1, 0, -1, 0) + ) + + mc_vec <- c(0, 1, -1, 0, 0) + res <- process.x.and.col.args( + iris, + monotone_constraints = mc_vec, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + c(0, 1, -1, 0, 0) + ) + + mc_named_vec <- c(1, 1) + names(mc_named_vec) <- names(iris)[1:2] + res <- process.x.and.col.args( + iris, + 
monotone_constraints = mc_named_vec, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + c(1, 1, 0, 0, 0) + ) + + mc_named_all <- c(0, -1, 1, 0, -1) + names(mc_named_all) <- rev(names(iris)) + res <- process.x.and.col.args( + iris, + monotone_constraints = mc_named_all, + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$params$monotone_constraints, + rev(mc_named_all) |> unname() + ) + + expect_error({ + process.x.and.col.args( + iris, + monotone_constraints = list( + Sepal.Width = 1, + Petal.Width = -1, + Sepal.Width = -1 + ), + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + }) + + expect_error({ + process.x.and.col.args( + iris, + monotone_constraints = rep(0, 6), + interaction_constraints = NULL, + feature_weights = NULL, + lst_args = list(), + use_qdm = FALSE + ) + }) +}) + +test_that("Process interaction_constraints", { + data(iris) + res <- process.x.and.col.args(iris, NULL, list(c(1L, 2L)), NULL, NULL, FALSE) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1)) + ) + res <- process.x.and.col.args(iris, NULL, list(c(1.0, 2.0)), NULL, NULL, FALSE) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1)) + ) + res <- process.x.and.col.args(iris, NULL, list(c(1, 2), c(3, 4)), NULL, NULL, FALSE) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1), c(2, 3)) + ) + res <- process.x.and.col.args( + iris, NULL, list(c("Sepal.Length", "Sepal.Width")), NULL, NULL, FALSE + ) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1)) + ) + res <- process.x.and.col.args( + as.matrix(iris), + NULL, + list(c("Sepal.Length", "Sepal.Width")), + NULL, + NULL, + FALSE + ) + expect_equal( + res$params$interaction_constraints, + list(c(0, 1)) + ) + res <- process.x.and.col.args( + iris, + NULL, + list(c("Sepal.Width", "Petal.Length"), c("Sepal.Length", "Petal.Width", "Species")), + NULL, + NULL, + FALSE + ) + expect_equal( + res$params$interaction_constraints, + list(c(1, 2), c(0, 3, 4)) + ) + + expect_error({ + process.x.and.col.args(iris, NULL, list(c(1L, 20L)), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, list(c(0L, 2L)), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, list(c("1", "2")), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, list(c("Sepal", "Petal")), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, c(1L, 2L), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, matrix(c(1L, 2L)), NULL, NULL, FALSE) + }) + expect_error({ + process.x.and.col.args(iris, NULL, list(c(1, 2.5)), NULL, NULL, FALSE) + }) +}) + +test_that("Sparse matrices are casted to CSR for QDM", { + data(agaricus.test, package = "xgboost") + x <- agaricus.test$data + for (x_in in list(x, methods::as(x, "TsparseMatrix"))) { + res <- process.x.and.col.args( + x_in, + NULL, + NULL, + NULL, + NULL, + TRUE + ) + expect_s4_class(res$dmatrix_args$data, "dgRMatrix") + } +}) + +test_that("Process feature_weights", { + data(iris) + w_vector <- seq(1, 5) + res <- process.x.and.col.args( + iris, + monotone_constraints = NULL, + interaction_constraints = NULL, + feature_weights = w_vector, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$dmatrix_args$feature_weights, + seq(1, 5) + ) + + 
w_named_vector <- seq(1, 5) + names(w_named_vector) <- rev(names(iris)) + res <- process.x.and.col.args( + iris, + monotone_constraints = NULL, + interaction_constraints = NULL, + feature_weights = w_named_vector, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$dmatrix_args$feature_weights, + rev(seq(1, 5)) + ) + + w_list <- list( + Species = 5, + Sepal.Length = 1, + Sepal.Width = 2, + Petal.Length = 3, + Petal.Width = 4 + ) + res <- process.x.and.col.args( + iris, + monotone_constraints = NULL, + interaction_constraints = NULL, + feature_weights = w_list, + lst_args = list(), + use_qdm = FALSE + ) + expect_equal( + res$dmatrix_args$feature_weights, + seq(1, 5) + ) +}) + +test_that("Whole function works", { + data(cancer, package = "survival") + y <- Surv(cancer$time, cancer$status - 1, type = "right") + x <- as.data.table(cancer)[, -c("time", "status")] + model <- xgboost( + x, + y, + monotone_constraints = list(age = -1), + nthreads = 1L, + nrounds = 5L, + eta = 3 + ) + expect_equal( + attributes(model)$params$objective, + "survival:aft" + ) + expect_equal( + attributes(model)$metadata$n_targets, + 1L + ) + expect_equal( + attributes(model)$params$monotone_constraints, + "(0,-1,0,0,0,0,0,0)" + ) + expect_false( + "interaction_constraints" %in% names(attributes(model)$params) + ) + expect_equal( + attributes(model)$params$eta, + 3 + ) + txt <- capture.output({ + print(model) + }) + expect_true(any(grepl("Objective: survival:aft", txt, fixed = TRUE))) + expect_true(any(grepl("monotone_constraints", txt, fixed = TRUE))) + expect_true(any(grepl("Number of iterations: 5", txt, fixed = TRUE))) + expect_true(any(grepl("Number of features: 8", txt, fixed = TRUE))) +}) diff --git a/R-package/vignettes/discoverYourData.Rmd b/R-package/vignettes/discoverYourData.Rmd index 4b04f771f210..8347d0ee0a84 100644 --- a/R-package/vignettes/discoverYourData.Rmd +++ b/R-package/vignettes/discoverYourData.Rmd @@ -173,8 +173,9 @@ Build the model The code below is very usual. For more information, you can look at the documentation of `xgboost` function (or at the vignette [XGBoost presentation](https://github.com/dmlc/xgboost/blob/master/R-package/vignettes/xgboostPresentation.Rmd)). ```{r} -bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 4, - eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic") +bst <- xgboost(x = sparse_matrix, y = output_vector, + params = list(max_depth = 4, eta = 1), + nthread = 2, nrounds = 10) ``` @@ -299,28 +300,28 @@ test <- agaricus.test #Random Forest - 1000 trees bst <- xgboost( - data = train$data, - label = train$label, - max_depth = 4, - num_parallel_tree = 1000, - subsample = 0.5, - colsample_bytree = 0.5, + x = train$data, + y = factor(train$label, levels = c(0, 1)), + params = list( + max_depth = 4, + num_parallel_tree = 1000, + subsample = 0.5, + colsample_bytree = 0.5 + ), nrounds = 1, - objective = "binary:logistic", nthread = 2 ) #Boosting - 3 rounds bst <- xgboost( - data = train$data, - label = train$label, - max_depth = 4, + x = train$data, + y = factor(train$label, levels = c(0, 1)), + params = list(max_depth = 4), nrounds = 3, - objective = "binary:logistic", nthread = 2 ) ``` -> Note that the parameter `round` is set to `1`. +> Note that the parameter `nrounds` is set to `1`. > [**Random Forests**](https://www.stat.berkeley.edu/~breiman/RandomForests/cc_papers.htm) is a trademark of Leo Breiman and Adele Cutler and is licensed exclusively to Salford Systems for the commercial release of the software. 
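Both the "Whole function works" test and the vignette example above rely on the fitted parameters being recorded on the returned object. Here is a minimal sketch of the same inspection pattern, assuming the attribute layout exercised in `test_xgboost.R`:

```r
library(xgboost)
data(agaricus.train, package = "xgboost")
train <- agaricus.train

bst <- xgboost(
  x = train$data,
  y = factor(train$label, levels = c(0, 1)),
  params = list(max_depth = 4),
  nrounds = 3,
  nthread = 2
)

# Fitted parameters and formatting metadata are stored as attributes:
attributes(bst)$params$objective    # expected: "binary:logistic" (set automatically)
attributes(bst)$metadata$n_targets  # expected: 1 for a binary classifier
```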
diff --git a/R-package/vignettes/xgboostPresentation.Rmd b/R-package/vignettes/xgboostPresentation.Rmd index fc49adc0fcee..d1ca4f2879a7 100644 --- a/R-package/vignettes/xgboostPresentation.Rmd +++ b/R-package/vignettes/xgboostPresentation.Rmd @@ -146,22 +146,19 @@ In a *sparse* matrix, cells containing `0` are not stored in memory. Therefore, We will train decision tree model using the following parameters: -* `objective = "binary:logistic"`: we will train a binary classification model ; +* `objective = "binary:logistic"`: we will train a binary classification model (note that this is set automatically when `y` is a `factor`) ; * `max_depth = 2`: the trees won't be deep, because our case is very simple ; * `nthread = 2`: the number of CPU threads we are going to use; * `nrounds = 2`: there will be two passes on the data, the second one will enhance the model by further reducing the difference between ground truth and prediction. ```{r trainingSparse, message=F, warning=F} bstSparse <- xgboost( - data = train$data - , label = train$label - , params = list( - max_depth = 2 - , eta = 1 - , nthread = 2 - , objective = "binary:logistic" - ) + x = train$data + , y = factor(train$label, levels = c(0, 1)) + , objective = "binary:logistic" + , params = list(max_depth = 2, eta = 1) , nrounds = 2 + , nthread = 2 ) ``` @@ -175,15 +172,11 @@ Alternatively, you can put your dataset in a *dense* matrix, i.e. a basic **R** ```{r trainingDense, message=F, warning=F} bstDense <- xgboost( - data = as.matrix(train$data), - label = train$label, - params = list( - max_depth = 2, - eta = 1, - nthread = 2, - objective = "binary:logistic" - ), - nrounds = 2 + x = as.matrix(train$data), + y = factor(train$label, levels = c(0, 1)), + params = list(max_depth = 2, eta = 1), + nrounds = 2, + nthread = 2 ) ``` @@ -193,7 +186,7 @@ bstDense <- xgboost( ```{r trainingDmatrix, message=F, warning=F} dtrain <- xgb.DMatrix(data = train$data, label = train$label, nthread = 2) -bstDMatrix <- xgboost( +bstDMatrix <- xgb.train( data = dtrain, params = list( max_depth = 2, @@ -213,7 +206,7 @@ One of the simplest way to see the training progress is to set the `verbose` opt ```{r trainingVerbose0, message=T, warning=F} # verbose = 0, no message -bst <- xgboost( +bst <- xgb.train( data = dtrain , params = list( max_depth = 2 @@ -228,7 +221,7 @@ bst <- xgboost( ```{r trainingVerbose1, message=T, warning=F} # verbose = 1, print evaluation metric -bst <- xgboost( +bst <- xgb.train( data = dtrain , params = list( max_depth = 2 @@ -243,7 +236,7 @@ bst <- xgboost( ```{r trainingVerbose2, message=T, warning=F} # verbose = 2, also print information about tree -bst <- xgboost( +bst <- xgb.train( data = dtrain , params = list( max_depth = 2 diff --git a/doc/tutorials/feature_interaction_constraint.rst b/doc/tutorials/feature_interaction_constraint.rst index b3d655584b95..7f26cd437325 100644 --- a/doc/tutorials/feature_interaction_constraint.rst +++ b/doc/tutorials/feature_interaction_constraint.rst @@ -178,9 +178,10 @@ parameter: Using feature name instead ************************** -XGBoost's Python package supports using feature names instead of feature index for +XGBoost's Python and R packages support using feature names instead of feature index for specifying the constraints. Given a data frame with columns ``["f0", "f1", "f2"]``, the -feature interaction constraint can be specified as ``[["f0", "f2"]]``. 
+feature interaction constraint can be specified as ``[["f0", "f2"]]`` (Python) or +``list(c("f0", "f2"))`` (R, when passing them to function ``xgboost()``). ************** Advanced topic diff --git a/doc/tutorials/monotonic.rst b/doc/tutorials/monotonic.rst index e663d1109689..6868e0a56037 100644 --- a/doc/tutorials/monotonic.rst +++ b/doc/tutorials/monotonic.rst @@ -97,7 +97,8 @@ Some other examples: Using feature names ******************* -XGBoost's Python package supports using feature names instead of feature index for +XGBoost's Python and R packages support using feature names instead of feature indices for specifying the constraints. Given a data frame with columns ``["f0", "f1", "f2"]``, the -monotonic constraint can be specified as ``{"f0": 1, "f2": -1}``, and ``"f1"`` will +monotonic constraint can be specified as ``{"f0": 1, "f2": -1}`` (Python) or as +``list(f0=1, f2=-1)`` (R, when using 'xgboost()', but not 'xgb.train'), and ``"f1"`` will default to ``0`` (no constraint). diff --git a/include/xgboost/host_device_vector.h b/include/xgboost/host_device_vector.h index 9a53d38583ca..36c7ed32b83d 100644 --- a/include/xgboost/host_device_vector.h +++ b/include/xgboost/host_device_vector.h @@ -135,7 +135,9 @@ class HostDeviceVector { void SetDevice(DeviceOrd device) const; - void Resize(size_t new_size, T v = T()); + void Resize(std::size_t new_size); + /** @brief Resize and initialize the data if the new size is larger than the old size. */ + void Resize(std::size_t new_size, T v); using value_type = T; // NOLINT diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml index c6f0fe45ef6c..fb6aaf020c3e 100644 --- a/jvm-packages/pom.xml +++ b/jvm-packages/pom.xml @@ -33,7 +33,7 @@ UTF-8 1.8 1.8 - 1.19.0 + 1.19.1 4.13.2 3.5.1 3.5.1 @@ -125,7 +125,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.4.1 + 3.4.2 empty-javadoc-jar @@ -143,7 +143,7 @@ org.apache.maven.plugins maven-release-plugin - 3.0.1 + 3.1.1 true false @@ -394,7 +394,7 @@ org.apache.maven.plugins maven-checkstyle-plugin - 3.3.1 + 3.4.0 checkstyle.xml true @@ -412,7 +412,7 @@ net.alchim31.maven scala-maven-plugin - 4.9.1 + 4.9.2 compile @@ -447,7 +447,7 @@ org.apache.maven.plugins maven-surefire-plugin - 3.2.5 + 3.3.1 false false @@ -470,12 +470,12 @@ maven-project-info-reports-plugin - 3.5.0 + 3.6.2 net.alchim31.maven scala-maven-plugin - 4.9.1 + 4.9.2 -Xms64m @@ -494,7 +494,7 @@ commons-logging commons-logging - 1.3.2 + 1.3.3 org.scalatest diff --git a/jvm-packages/xgboost4j/pom.xml b/jvm-packages/xgboost4j/pom.xml index acb724050f13..345098327f5c 100644 --- a/jvm-packages/xgboost4j/pom.xml +++ b/jvm-packages/xgboost4j/pom.xml @@ -106,7 +106,7 @@ org.apache.maven.plugins maven-jar-plugin - 3.4.1 + 3.4.2 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 49290179a899..51ce25607fa0 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -9,7 +9,6 @@ target_sources(objxgboost PRIVATE ${CPU_SOURCES}) set_source_files_properties( predictor/predictor.cc gbm/gbm.cc tree/tree_updater.cc metric/metric.cc objective/objective.cc PROPERTIES SKIP_UNITY_BUILD_INCLUSION ON) -target_sources(objxgboost PRIVATE ${RABIT_SOURCES}) if(USE_CUDA) file(GLOB_RECURSE CUDA_SOURCES *.cu *.cuh) @@ -23,8 +22,7 @@ endif() target_include_directories(objxgboost PRIVATE ${xgboost_SOURCE_DIR}/include - ${xgboost_SOURCE_DIR}/dmlc-core/include - ${xgboost_SOURCE_DIR}/rabit/include) + ${xgboost_SOURCE_DIR}/dmlc-core/include) if(LOG_CAPI_INVOCATION) target_compile_definitions(objxgboost PRIVATE -DLOG_CAPI_INVOCATION=1) diff --git 
a/src/common/cuda_context.cuh b/src/common/cuda_context.cuh index c8b2e07927c9..7e1db8e3bf2f 100644 --- a/src/common/cuda_context.cuh +++ b/src/common/cuda_context.cuh @@ -18,7 +18,7 @@ struct CUDAContext { * \brief Caching thrust policy. */ auto CTP() const { -#if THRUST_MAJOR_VERSION >= 2 +#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) return thrust::cuda::par_nosync(caching_alloc_).on(dh::DefaultStream()); #else return thrust::cuda::par(caching_alloc_).on(dh::DefaultStream()); diff --git a/src/common/device_helpers.cuh b/src/common/device_helpers.cuh index 7cd00f6f6112..1754c9507036 100644 --- a/src/common/device_helpers.cuh +++ b/src/common/device_helpers.cuh @@ -1,26 +1,21 @@ /** - * Copyright 2017-2023 XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #pragma once -#include // thrust::upper_bound -#include -#include -#include +#include // thrust::upper_bound +#include // for device_ptr +#include // for device_vector #include // thrust::seq -#include // gather -#include +#include // for discard_iterator #include // make_transform_output_iterator -#include -#include #include #include -#include #include #include #include // for size_t #include -#include +#include // for UnitWord #include #include #include @@ -28,22 +23,14 @@ #include "../collective/communicator-inl.h" #include "common.h" +#include "device_vector.cuh" #include "xgboost/host_device_vector.h" #include "xgboost/logging.h" #include "xgboost/span.h" -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -#include "rmm/mr/device/per_device_resource.hpp" -#include "rmm/mr/device/thrust_allocator_adaptor.hpp" -#include "rmm/version_config.hpp" - -#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) -#error "Please use RMM version 0.18 or later" -#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18 -#error "Please use RMM version 0.18 or later" -#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) - -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#if defined(XGBOOST_USE_RMM) +#include +#endif // defined(XGBOOST_USE_RMM) #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600 || defined(__clang__) @@ -285,91 +272,6 @@ void Iota(Container array, cudaStream_t stream) { LaunchN(array.size(), stream, [=] __device__(size_t i) { array[i] = i; }); } -namespace detail { -/** \brief Keeps track of global device memory allocations. 
Thread safe.*/ -class MemoryLogger { - // Information for a single device - struct DeviceStats { - size_t currently_allocated_bytes{ 0 }; - size_t peak_allocated_bytes{ 0 }; - size_t num_allocations{ 0 }; - size_t num_deallocations{ 0 }; - std::map device_allocations; - void RegisterAllocation(void *ptr, size_t n) { - device_allocations[ptr] = n; - currently_allocated_bytes += n; - peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); - num_allocations++; - CHECK_GT(num_allocations, num_deallocations); - } - void RegisterDeallocation(void *ptr, size_t n, int current_device) { - auto itr = device_allocations.find(ptr); - if (itr == device_allocations.end()) { - LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device - << " that was never allocated\n" - << dmlc::StackTrace(); - } else { - num_deallocations++; - CHECK_LE(num_deallocations, num_allocations); - currently_allocated_bytes -= itr->second; - device_allocations.erase(itr); - } - } - }; - DeviceStats stats_; - std::mutex mutex_; - -public: - void RegisterAllocation(void *ptr, size_t n) { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - stats_.RegisterAllocation(ptr, n); - } - void RegisterDeallocation(void *ptr, size_t n) { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - stats_.RegisterDeallocation(ptr, n, current_device); - } - size_t PeakMemory() const { - return stats_.peak_allocated_bytes; - } - size_t CurrentlyAllocatedBytes() const { - return stats_.currently_allocated_bytes; - } - void Clear() - { - stats_ = DeviceStats(); - } - - void Log() { - if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { - return; - } - std::lock_guard guard(mutex_); - int current_device; - safe_cuda(cudaGetDevice(¤t_device)); - LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " - << " ========"; - LOG(CONSOLE) << "Peak memory usage: " - << stats_.peak_allocated_bytes / 1048576 << "MiB"; - LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; - } -}; -} // namespace detail - -inline detail::MemoryLogger &GlobalMemoryLogger() { - static detail::MemoryLogger memory_logger; - return memory_logger; -} - // dh::DebugSyncDevice(__FILE__, __LINE__); inline void DebugSyncDevice(std::string file="", int32_t line = -1) { if (file != "" && line != -1) { @@ -380,134 +282,6 @@ inline void DebugSyncDevice(std::string file="", int32_t line = -1) { safe_cuda(cudaGetLastError()); } -namespace detail { - -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; -#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -template -using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - -inline void ThrowOOMError(std::string const& err, size_t bytes) { - auto device = CurrentDevice(); - auto rank = xgboost::collective::GetRank(); - std::stringstream ss; - ss << "Memory allocation error on worker " << rank << ": " << err << "\n" - << "- Free memory: " << AvailableMemory(device) << "\n" - << "- Requested memory: " << bytes << std::endl; - LOG(FATAL) << ss.str(); -} - -/** - * \brief Default memory allocator, uses cudaMalloc/Free and 
logs allocations if verbose. - */ -template -struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { - using SuperT = XGBBaseDeviceAllocator; - using pointer = thrust::device_ptr; // NOLINT - template - struct rebind // NOLINT - { - using other = XGBDefaultDeviceAllocatorImpl; // NOLINT - }; - pointer allocate(size_t n) { // NOLINT - pointer ptr; - try { - ptr = SuperT::allocate(n); - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception &e) { - ThrowOOMError(e.what(), n * sizeof(T)); - } - GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); - return ptr; - } - void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - SuperT::deallocate(ptr, n); - } -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBDefaultDeviceAllocatorImpl() - : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 -}; - -/** - * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless - * RMM pool allocator is enabled. Does not initialise memory on construction. - */ -template -struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { - using SuperT = XGBBaseDeviceAllocator; - using pointer = thrust::device_ptr; // NOLINT - template - struct rebind // NOLINT - { - using other = XGBCachingDeviceAllocatorImpl; // NOLINT - }; - cub::CachingDeviceAllocator& GetGlobalCachingAllocator() { - // Configure allocator with maximum cached bin size of ~1GB and no limit on - // maximum cached bytes - thread_local std::unique_ptr allocator{ - std::make_unique(2, 9, 29)}; - return *allocator; - } - pointer allocate(size_t n) { // NOLINT - pointer thrust_ptr; - if (use_cub_allocator_) { - T* raw_ptr{nullptr}; - auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), - n * sizeof(T)); - if (errc != cudaSuccess) { - ThrowOOMError("Caching allocator", n * sizeof(T)); - } - thrust_ptr = pointer(raw_ptr); - } else { - try { - thrust_ptr = SuperT::allocate(n); - dh::safe_cuda(cudaGetLastError()); - } catch (const std::exception &e) { - ThrowOOMError(e.what(), n * sizeof(T)); - } - } - GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); - return thrust_ptr; - } - void deallocate(pointer ptr, size_t n) { // NOLINT - GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); - if (use_cub_allocator_) { - GetGlobalCachingAllocator().DeviceFree(ptr.get()); - } else { - SuperT::deallocate(ptr, n); - } - } -#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBCachingDeviceAllocatorImpl() - : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()), - use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {} -#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 - XGBOOST_DEVICE void construct(T *) {} // NOLINT - private: - bool use_cub_allocator_{true}; -}; -} // namespace detail - -// Declare xgboost allocators -// Replacement of allocator with custom backend should occur here -template -using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; -/*! Be careful that the initialization constructor is a no-op, which means calling - * `vec.resize(n)` won't initialize the memory region to 0. Instead use - * `vec.resize(n, 0)`*/ -template -using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; -/** \brief Specialisation of thrust device vector using custom allocator. 
*/ -template -using device_vector = thrust::device_vector>; // NOLINT -template -using caching_device_vector = thrust::device_vector>; // NOLINT - // Faster to instantiate than caching_device_vector and invokes no synchronisation // Use this where vector functionality (e.g. resize) is not required template @@ -734,6 +508,11 @@ xgboost::common::Span ToSpan(thrust::device_vector& vec, return ToSpan(vec, offset, size); } +template +xgboost::common::Span ToSpan(DeviceUVector &vec) { + return {thrust::raw_pointer_cast(vec.data()), vec.size()}; +} + // thrust begin, similiar to std::begin template thrust::device_ptr tbegin(xgboost::HostDeviceVector& vector) { // NOLINT @@ -1117,6 +896,15 @@ class CUDAStream { void Sync() { this->View().Sync(); } }; +inline auto CachingThrustPolicy() { + XGBCachingDeviceAllocator alloc; +#if THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) + return thrust::cuda::par_nosync(alloc).on(DefaultStream()); +#else + return thrust::cuda::par(alloc).on(DefaultStream()); +#endif // THRUST_MAJOR_VERSION >= 2 || defined(XGBOOST_USE_RMM) +} + // Force nvcc to load data as constant template class LDGIterator { diff --git a/src/common/device_vector.cu b/src/common/device_vector.cu new file mode 100644 index 000000000000..50922d8f978e --- /dev/null +++ b/src/common/device_vector.cu @@ -0,0 +1,27 @@ +/** + * Copyright 2017-2024, XGBoost contributors + */ +#include "../collective/communicator-inl.h" // for GetRank +#include "device_helpers.cuh" // for CurrentDevice +#include "device_vector.cuh" + +namespace dh { +namespace detail { +void ThrowOOMError(std::string const &err, size_t bytes) { + auto device = CurrentDevice(); + auto rank = xgboost::collective::GetRank(); + std::stringstream ss; + ss << "Memory allocation error on worker " << rank << ": " << err << "\n" + << "- Free memory: " << dh::AvailableMemory(device) << "\n" + << "- Requested memory: " << bytes << std::endl; + LOG(FATAL) << ss.str(); +} +} // namespace detail + +#if defined(XGBOOST_USE_RMM) +LoggingResource *GlobalLoggingResource() { + static auto mr{std::make_unique()}; + return mr.get(); +} +#endif // defined(XGBOOST_USE_RMM) +} // namespace dh diff --git a/src/common/device_vector.cuh b/src/common/device_vector.cuh new file mode 100644 index 000000000000..35386856cc9c --- /dev/null +++ b/src/common/device_vector.cuh @@ -0,0 +1,330 @@ +/** + * Copyright 2017-2024, XGBoost Contributors + */ +#pragma once +#include // for device_malloc_allocator +#include // for device_ptr +#include // for device_vector + +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +#include // for device_uvector +#include // for exec_policy_nosync +#include // for device_memory_resource +#include // for get_current_device_resource +#include // for thrust_allocator +#include // for RMM_VERSION_MAJOR + +#include "xgboost/global_config.h" // for GlobalConfigThreadLocalStore + +#if !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) + +#error "Please use RMM version 0.18 or later" +#elif RMM_VERSION_MAJOR == 0 && RMM_VERSION_MINOR < 18 +#error "Please use RMM version 0.18 or later" +#endif // !defined(RMM_VERSION_MAJOR) || !defined(RMM_VERSION_MINOR) + +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + +#include // for size_t +#include // for CachingDeviceAllocator +#include // for CurrentDevice +#include // for map +#include // for unique_ptr + +#include "common.h" // for safe_cuda +#include "xgboost/logging.h" + +namespace dh { +namespace detail { +/** \brief Keeps track of global device memory allocations. 
Thread safe.*/ +class MemoryLogger { + // Information for a single device + struct DeviceStats { + std::size_t currently_allocated_bytes{0}; + size_t peak_allocated_bytes{0}; + size_t num_allocations{0}; + size_t num_deallocations{0}; + std::map device_allocations; + void RegisterAllocation(void *ptr, size_t n) { + device_allocations[ptr] = n; + currently_allocated_bytes += n; + peak_allocated_bytes = std::max(peak_allocated_bytes, currently_allocated_bytes); + num_allocations++; + CHECK_GT(num_allocations, num_deallocations); + } + void RegisterDeallocation(void *ptr, size_t n, int current_device) { + auto itr = device_allocations.find(ptr); + if (itr == device_allocations.end()) { + LOG(WARNING) << "Attempting to deallocate " << n << " bytes on device " << current_device + << " that was never allocated\n" + << dmlc::StackTrace(); + } else { + num_deallocations++; + CHECK_LE(num_deallocations, num_allocations); + currently_allocated_bytes -= itr->second; + device_allocations.erase(itr); + } + } + }; + DeviceStats stats_; + std::mutex mutex_; + + public: + void RegisterAllocation(void *ptr, size_t n) { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + stats_.RegisterAllocation(ptr, n); + } + void RegisterDeallocation(void *ptr, size_t n) { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + stats_.RegisterDeallocation(ptr, n, cub::CurrentDevice()); + } + size_t PeakMemory() const { return stats_.peak_allocated_bytes; } + size_t CurrentlyAllocatedBytes() const { return stats_.currently_allocated_bytes; } + void Clear() { stats_ = DeviceStats(); } + + void Log() { + if (!xgboost::ConsoleLogger::ShouldLog(xgboost::ConsoleLogger::LV::kDebug)) { + return; + } + std::lock_guard guard(mutex_); + int current_device; + dh::safe_cuda(cudaGetDevice(¤t_device)); + LOG(CONSOLE) << "======== Device " << current_device << " Memory Allocations: " + << " ========"; + LOG(CONSOLE) << "Peak memory usage: " << stats_.peak_allocated_bytes / 1048576 << "MiB"; + LOG(CONSOLE) << "Number of allocations: " << stats_.num_allocations; + } +}; + +void ThrowOOMError(std::string const &err, size_t bytes); +} // namespace detail + +inline detail::MemoryLogger &GlobalMemoryLogger() { + static detail::MemoryLogger memory_logger; + return memory_logger; +} + +namespace detail { +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = rmm::mr::thrust_allocator; +#else // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +template +using XGBBaseDeviceAllocator = thrust::device_malloc_allocator; +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + +/** + * \brief Default memory allocator, uses cudaMalloc/Free and logs allocations if verbose. 
+ */ +template +struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; + using pointer = thrust::device_ptr; // NOLINT + template + struct rebind // NOLINT + { + using other = XGBDefaultDeviceAllocatorImpl; // NOLINT + }; + pointer allocate(size_t n) { // NOLINT + pointer ptr; + try { + ptr = SuperT::allocate(n); + dh::safe_cuda(cudaGetLastError()); + } catch (const std::exception &e) { + detail::ThrowOOMError(e.what(), n * sizeof(T)); + } + GlobalMemoryLogger().RegisterAllocation(ptr.get(), n * sizeof(T)); + return ptr; + } + void deallocate(pointer ptr, size_t n) { // NOLINT + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + SuperT::deallocate(ptr, n); + } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBDefaultDeviceAllocatorImpl() + : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 +}; + +/** + * \brief Caching memory allocator, uses cub::CachingDeviceAllocator as a back-end, unless + * RMM pool allocator is enabled. Does not initialise memory on construction. + */ +template +struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator { + using SuperT = XGBBaseDeviceAllocator; + using pointer = thrust::device_ptr; // NOLINT + template + struct rebind // NOLINT + { + using other = XGBCachingDeviceAllocatorImpl; // NOLINT + }; + cub::CachingDeviceAllocator &GetGlobalCachingAllocator() { + // Configure allocator with maximum cached bin size of ~1GB and no limit on + // maximum cached bytes + thread_local std::unique_ptr allocator{ + std::make_unique(2, 9, 29)}; + return *allocator; + } + pointer allocate(size_t n) { // NOLINT + pointer thrust_ptr; + if (use_cub_allocator_) { + T *raw_ptr{nullptr}; + auto errc = GetGlobalCachingAllocator().DeviceAllocate(reinterpret_cast(&raw_ptr), + n * sizeof(T)); + if (errc != cudaSuccess) { + detail::ThrowOOMError("Caching allocator", n * sizeof(T)); + } + thrust_ptr = pointer(raw_ptr); + } else { + try { + thrust_ptr = SuperT::allocate(n); + dh::safe_cuda(cudaGetLastError()); + } catch (const std::exception &e) { + detail::ThrowOOMError(e.what(), n * sizeof(T)); + } + } + GlobalMemoryLogger().RegisterAllocation(thrust_ptr.get(), n * sizeof(T)); + return thrust_ptr; + } + void deallocate(pointer ptr, size_t n) { // NOLINT + GlobalMemoryLogger().RegisterDeallocation(ptr.get(), n * sizeof(T)); + if (use_cub_allocator_) { + GetGlobalCachingAllocator().DeviceFree(ptr.get()); + } else { + SuperT::deallocate(ptr, n); + } + } +#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBCachingDeviceAllocatorImpl() + : SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()), + use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {} +#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1 + XGBOOST_DEVICE void construct(T *) {} // NOLINT + private: + bool use_cub_allocator_{true}; +}; +} // namespace detail + +// Declare xgboost allocators +// Replacement of allocator with custom backend should occur here +template +using XGBDeviceAllocator = detail::XGBDefaultDeviceAllocatorImpl; + +/** Be careful that the initialization constructor is a no-op, which means calling + * `vec.resize(n)` won't initialize the memory region to 0. Instead use + * `vec.resize(n, 0)` + */ +template +using XGBCachingDeviceAllocator = detail::XGBCachingDeviceAllocatorImpl; + +/** @brief Specialisation of thrust device vector using custom allocator. 
*/ +template +using device_vector = thrust::device_vector>; // NOLINT +template +using caching_device_vector = thrust::device_vector>; // NOLINT + +#if defined(XGBOOST_USE_RMM) +/** + * @brief Similar to `rmm::logging_resource_adaptor`, but uses XGBoost memory logger instead. + */ +class LoggingResource : public rmm::mr::device_memory_resource { + rmm::mr::device_memory_resource *mr_{rmm::mr::get_current_device_resource()}; + + public: + LoggingResource() = default; + ~LoggingResource() override = default; + LoggingResource(LoggingResource const &) = delete; + LoggingResource &operator=(LoggingResource const &) = delete; + LoggingResource(LoggingResource &&) noexcept = default; + LoggingResource &operator=(LoggingResource &&) noexcept = default; + + [[nodiscard]] rmm::device_async_resource_ref get_upstream_resource() const noexcept { // NOLINT + return mr_; + } + [[nodiscard]] rmm::mr::device_memory_resource *get_upstream() const noexcept { // NOLINT + return mr_; + } + + void *do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override { // NOLINT + try { + auto const ptr = mr_->allocate(bytes, stream); + GlobalMemoryLogger().RegisterAllocation(ptr, bytes); + return ptr; + } catch (rmm::bad_alloc const &e) { + detail::ThrowOOMError(e.what(), bytes); + } + return nullptr; + } + + void do_deallocate(void *ptr, std::size_t bytes, // NOLINT + rmm::cuda_stream_view stream) override { + mr_->deallocate(ptr, bytes, stream); + GlobalMemoryLogger().RegisterDeallocation(ptr, bytes); + } + + [[nodiscard]] bool do_is_equal( // NOLINT + device_memory_resource const &other) const noexcept override { + if (this == &other) { + return true; + } + auto const *cast = dynamic_cast(&other); + if (cast == nullptr) { + return mr_->is_equal(other); + } + return get_upstream_resource() == cast->get_upstream_resource(); + } +}; + +LoggingResource *GlobalLoggingResource(); + +/** + * @brief Container class that doesn't initialize the data. + */ +template +class DeviceUVector : public rmm::device_uvector { + using Super = rmm::device_uvector; + + public: + DeviceUVector() : Super{0, rmm::cuda_stream_per_thread, GlobalLoggingResource()} {} + + void Resize(std::size_t n) { Super::resize(n, rmm::cuda_stream_per_thread); } + void Resize(std::size_t n, T const &v) { + auto orig = this->size(); + Super::resize(n, rmm::cuda_stream_per_thread); + if (orig < n) { + thrust::fill(rmm::exec_policy_nosync{}, this->begin() + orig, this->end(), v); + } + } + + private: + // undefined private, cannot be accessed. + void resize(std::size_t n, rmm::cuda_stream_view stream); // NOLINT +}; + +#else + +/** + * @brief Without RMM, the initialization will happen. + */ +template +class DeviceUVector : public thrust::device_vector> { + using Super = thrust::device_vector>; + + public: + void Resize(std::size_t n) { Super::resize(n); } + void Resize(std::size_t n, T const &v) { Super::resize(n, v); } + + private: + // undefined private, cannot be accessed. 
+ void resize(std::size_t n, T const &v = T{}); // NOLINT +}; + +#endif // defined(XGBOOST_USE_RMM) +} // namespace dh diff --git a/src/common/host_device_vector.cc b/src/common/host_device_vector.cc index f4973c0428f0..de9e0614a38e 100644 --- a/src/common/host_device_vector.cc +++ b/src/common/host_device_vector.cc @@ -114,6 +114,11 @@ void HostDeviceVector::Resize(size_t new_size, T v) { impl_->Vec().resize(new_size, v); } +template +void HostDeviceVector::Resize(size_t new_size) { + impl_->Vec().resize(new_size, T{}); +} + template void HostDeviceVector::Fill(T v) { std::fill(HostVector().begin(), HostVector().end(), v); diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu index 99448df21b7e..16a1aa027f09 100644 --- a/src/common/host_device_vector.cu +++ b/src/common/host_device_vector.cu @@ -1,16 +1,17 @@ /** - * Copyright 2017-2023 by XGBoost contributors + * Copyright 2017-2024, XGBoost contributors */ #include -#include #include +#include // for size_t #include +#include "device_helpers.cuh" +#include "device_vector.cuh" // for DeviceUVector #include "xgboost/data.h" #include "xgboost/host_device_vector.h" -#include "xgboost/tree_model.h" -#include "device_helpers.cuh" +#include "xgboost/tree_model.h" // for RegTree namespace xgboost { @@ -28,7 +29,7 @@ class HostDeviceVectorImpl { if (device.IsCUDA()) { gpu_access_ = GPUAccess::kWrite; SetDevice(); - data_d_->resize(size, v); + data_d_->Resize(size, v); } else { data_h_.resize(size, v); } @@ -66,22 +67,22 @@ class HostDeviceVectorImpl { T* DevicePointer() { LazySyncDevice(GPUAccess::kWrite); - return data_d_->data().get(); + return thrust::raw_pointer_cast(data_d_->data()); } const T* ConstDevicePointer() { LazySyncDevice(GPUAccess::kRead); - return data_d_->data().get(); + return thrust::raw_pointer_cast(data_d_->data()); } common::Span DeviceSpan() { LazySyncDevice(GPUAccess::kWrite); - return {data_d_->data().get(), Size()}; + return {this->DevicePointer(), Size()}; } common::Span ConstDeviceSpan() { LazySyncDevice(GPUAccess::kRead); - return {data_d_->data().get(), Size()}; + return {this->ConstDevicePointer(), Size()}; } void Fill(T v) { // NOLINT @@ -91,7 +92,7 @@ class HostDeviceVectorImpl { gpu_access_ = GPUAccess::kWrite; SetDevice(); auto s_data = dh::ToSpan(*data_d_); - dh::LaunchN(data_d_->size(), + dh::LaunchN(data_d_->size(), dh::DefaultStream(), [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; }); } } @@ -128,7 +129,7 @@ class HostDeviceVectorImpl { void Extend(HostDeviceVectorImpl* other) { auto ori_size = this->Size(); - this->Resize(ori_size + other->Size(), T()); + this->Resize(ori_size + other->Size(), T{}); if (HostCanWrite() && other->HostCanRead()) { auto& h_vec = this->HostVector(); auto& other_vec = other->HostVector(); @@ -138,10 +139,9 @@ class HostDeviceVectorImpl { auto ptr = other->ConstDevicePointer(); SetDevice(); CHECK_EQ(this->Device(), other->Device()); - dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, - ptr, - other->Size() * sizeof(T), - cudaMemcpyDeviceToDevice)); + dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr, + other->Size() * sizeof(T), cudaMemcpyDeviceToDevice, + dh::DefaultStream())); } } @@ -171,17 +171,22 @@ class HostDeviceVectorImpl { } } - void Resize(size_t new_size, T v) { - if (new_size == Size()) { return; } + template + auto Resize(std::size_t new_size, U&&... 
diff --git a/src/common/host_device_vector.cu b/src/common/host_device_vector.cu
index 99448df21b7e..16a1aa027f09 100644
--- a/src/common/host_device_vector.cu
+++ b/src/common/host_device_vector.cu
@@ -1,16 +1,17 @@
 /**
- * Copyright 2017-2023 by XGBoost contributors
+ * Copyright 2017-2024, XGBoost contributors
  */
 #include <thrust/fill.h>
-#include <thrust/device_ptr.h>
 
 #include <algorithm>
+#include <cstddef>  // for size_t
 #include <cstdint>
 
+#include "device_helpers.cuh"
+#include "device_vector.cuh"  // for DeviceUVector
 #include "xgboost/data.h"
 #include "xgboost/host_device_vector.h"
-#include "xgboost/tree_model.h"
-#include "device_helpers.cuh"
+#include "xgboost/tree_model.h"  // for RegTree
 
 namespace xgboost {
@@ -28,7 +29,7 @@ class HostDeviceVectorImpl {
     if (device.IsCUDA()) {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->resize(size, v);
+      data_d_->Resize(size, v);
     } else {
       data_h_.resize(size, v);
     }
@@ -66,22 +67,22 @@ class HostDeviceVectorImpl {
   T* DevicePointer() {
     LazySyncDevice(GPUAccess::kWrite);
-    return data_d_->data().get();
+    return thrust::raw_pointer_cast(data_d_->data());
   }
 
   const T* ConstDevicePointer() {
     LazySyncDevice(GPUAccess::kRead);
-    return data_d_->data().get();
+    return thrust::raw_pointer_cast(data_d_->data());
   }
 
   common::Span<T> DeviceSpan() {
     LazySyncDevice(GPUAccess::kWrite);
-    return {data_d_->data().get(), Size()};
+    return {this->DevicePointer(), Size()};
   }
 
   common::Span<const T> ConstDeviceSpan() {
     LazySyncDevice(GPUAccess::kRead);
-    return {data_d_->data().get(), Size()};
+    return {this->ConstDevicePointer(), Size()};
   }
 
   void Fill(T v) {  // NOLINT
@@ -91,7 +92,7 @@ class HostDeviceVectorImpl {
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
       auto s_data = dh::ToSpan(*data_d_);
-      dh::LaunchN(data_d_->size(),
+      dh::LaunchN(data_d_->size(), dh::DefaultStream(),
                   [=] XGBOOST_DEVICE(size_t i) { s_data[i] = v; });
     }
   }
@@ -128,7 +129,7 @@ class HostDeviceVectorImpl {
 
   void Extend(HostDeviceVectorImpl* other) {
     auto ori_size = this->Size();
-    this->Resize(ori_size + other->Size(), T());
+    this->Resize(ori_size + other->Size(), T{});
     if (HostCanWrite() && other->HostCanRead()) {
       auto& h_vec = this->HostVector();
       auto& other_vec = other->HostVector();
@@ -138,10 +139,9 @@ class HostDeviceVectorImpl {
       auto ptr = other->ConstDevicePointer();
       SetDevice();
       CHECK_EQ(this->Device(), other->Device());
-      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size,
-                                    ptr,
-                                    other->Size() * sizeof(T),
-                                    cudaMemcpyDeviceToDevice));
+      dh::safe_cuda(cudaMemcpyAsync(this->DevicePointer() + ori_size, ptr,
+                                    other->Size() * sizeof(T), cudaMemcpyDeviceToDevice,
+                                    dh::DefaultStream()));
     }
   }
@@ -171,17 +171,22 @@ class HostDeviceVectorImpl {
     }
   }
 
-  void Resize(size_t new_size, T v) {
-    if (new_size == Size()) { return; }
+  template <typename... U>
+  auto Resize(std::size_t new_size, U&&... args) {
+    if (new_size == Size()) {
+      return;
+    }
     if ((Size() == 0 && device_.IsCUDA()) || (DeviceCanWrite() && device_.IsCUDA())) {
       // fast on-device resize
       gpu_access_ = GPUAccess::kWrite;
       SetDevice();
-      data_d_->resize(new_size, v);
+      auto old_size = data_d_->size();
+      data_d_->Resize(new_size, std::forward<U>(args)...);
     } else {
       // resize on host
       LazySyncHost(GPUAccess::kNone);
-      data_h_.resize(new_size, v);
+      auto old_size = data_h_.size();
+      data_h_.resize(new_size, std::forward<U>(args)...);
     }
   }
@@ -195,10 +200,8 @@ class HostDeviceVectorImpl {
     gpu_access_ = access;
     if (data_h_.size() != data_d_->size()) { data_h_.resize(data_d_->size()); }
     SetDevice();
-    dh::safe_cuda(cudaMemcpy(data_h_.data(),
-                             data_d_->data().get(),
-                             data_d_->size() * sizeof(T),
-                             cudaMemcpyDeviceToHost));
+    dh::safe_cuda(cudaMemcpy(data_h_.data(), thrust::raw_pointer_cast(data_d_->data()),
+                             data_d_->size() * sizeof(T), cudaMemcpyDeviceToHost));
   }
 
   void LazySyncDevice(GPUAccess access) {
@@ -211,10 +214,9 @@ class HostDeviceVectorImpl {
     // data is on the host
     LazyResizeDevice(data_h_.size());
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(),
-                                  data_h_.data(),
-                                  data_d_->size() * sizeof(T),
-                                  cudaMemcpyHostToDevice));
+    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), data_h_.data(),
+                                  data_d_->size() * sizeof(T), cudaMemcpyHostToDevice,
+                                  dh::DefaultStream()));
     gpu_access_ = access;
   }
@@ -229,7 +231,7 @@ class HostDeviceVectorImpl {
  private:
   DeviceOrd device_{DeviceOrd::CPU()};
   std::vector<T> data_h_{};
-  std::unique_ptr<dh::device_vector<T>> data_d_{};
+  std::unique_ptr<dh::DeviceUVector<T>> data_d_{};
   GPUAccess gpu_access_{GPUAccess::kNone};
 
   void CopyToDevice(HostDeviceVectorImpl* other) {
@@ -239,8 +241,10 @@ class HostDeviceVectorImpl {
     LazyResizeDevice(Size());
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), other->data_d_->data().get(),
-                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
+    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()),
+                                  thrust::raw_pointer_cast(other->data_d_->data()),
+                                  data_d_->size() * sizeof(T), cudaMemcpyDefault,
+                                  dh::DefaultStream()));
     }
   }
@@ -248,14 +252,15 @@ class HostDeviceVectorImpl {
     LazyResizeDevice(Size());
     gpu_access_ = GPUAccess::kWrite;
     SetDevice();
-    dh::safe_cuda(cudaMemcpyAsync(data_d_->data().get(), begin,
-                                  data_d_->size() * sizeof(T), cudaMemcpyDefault));
+    dh::safe_cuda(cudaMemcpyAsync(thrust::raw_pointer_cast(data_d_->data()), begin,
+                                  data_d_->size() * sizeof(T), cudaMemcpyDefault,
+                                  dh::DefaultStream()));
   }
 
   void LazyResizeDevice(size_t new_size) {
     if (data_d_ && new_size == data_d_->size()) { return; }
     SetDevice();
-    data_d_->resize(new_size);
+    data_d_->Resize(new_size);
   }
 
   void SetDevice() {
@@ -267,7 +272,7 @@ class HostDeviceVectorImpl {
     }
 
     if (!data_d_) {
-      data_d_.reset(new dh::device_vector<T>);
+      data_d_.reset(new dh::DeviceUVector<T>{});
     }
   }
 };
@@ -397,7 +402,12 @@ void HostDeviceVector<T>::SetDevice(DeviceOrd device) const {
 }
 
 template <typename T>
-void HostDeviceVector<T>::Resize(size_t new_size, T v) {
+void HostDeviceVector<T>::Resize(std::size_t new_size) {
+  impl_->Resize(new_size);
+}
+
+template <typename T>
+void HostDeviceVector<T>::Resize(std::size_t new_size, T v) {
   impl_->Resize(new_size, v);
 }
@@ -427,5 +437,4 @@ template class HostDeviceVector<std::uint32_t>;
  */
 template class HostDeviceVector<std::size_t>;
 #endif  // defined(__APPLE__)
-
 }  // namespace xgboost
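The variadic `Resize(std::size_t, U&&...)` collapses the fill and no-fill paths into one function: an empty parameter pack selects the uninitialized `Resize(n)` backend overload, while a one-element pack selects `Resize(n, v)`. A reduced, self-contained sketch of that dispatch (names are illustrative):

```cpp
#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

struct Backend {
  std::vector<int> data;
  void Resize(std::size_t n) { data.resize(n); }            // no-fill path in the real code
  void Resize(std::size_t n, int v) { data.resize(n, v); }  // fill path
};

template <typename... U>
void Resize(Backend* impl, std::size_t n, U&&... args) {
  // Overload resolution picks Resize(n) or Resize(n, v) based on the pack size.
  impl->Resize(n, std::forward<U>(args)...);
}

int main() {
  Backend b;
  Resize(&b, 4);     // forwards to Backend::Resize(std::size_t)
  Resize(&b, 8, 7);  // forwards to Backend::Resize(std::size_t, int)
  std::cout << b.data.back() << "\n";  // 7
}
```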
diff --git a/src/common/quantile.cuh b/src/common/quantile.cuh
index 898da03a0dce..3dd393755852 100644
--- a/src/common/quantile.cuh
+++ b/src/common/quantile.cuh
@@ -4,12 +4,14 @@
 #ifndef XGBOOST_COMMON_QUANTILE_CUH_
 #define XGBOOST_COMMON_QUANTILE_CUH_
 
-#include "xgboost/span.h"
-#include "xgboost/data.h"
+#include <thrust/logical.h>  // for any_of
+
+#include "categorical.h"
 #include "device_helpers.cuh"
 #include "quantile.h"
 #include "timer.h"
-#include "categorical.h"
+#include "xgboost/data.h"
+#include "xgboost/span.h"
 
 namespace xgboost {
 namespace common {
@@ -100,9 +102,9 @@ class SketchContainer {
     CHECK(device.IsCUDA());
     // Initialize Sketches for this dmatrix
     this->columns_ptr_.SetDevice(device_);
-    this->columns_ptr_.Resize(num_columns + 1);
+    this->columns_ptr_.Resize(num_columns + 1, 0);
     this->columns_ptr_b_.SetDevice(device_);
-    this->columns_ptr_b_.Resize(num_columns + 1);
+    this->columns_ptr_b_.Resize(num_columns + 1, 0);
 
     this->feature_types_.Resize(feature_types.Size());
     this->feature_types_.Copy(feature_types);
diff --git a/src/metric/auc.cu b/src/metric/auc.cu
index 59199b092839..4155a7084481 100644
--- a/src/metric/auc.cu
+++ b/src/metric/auc.cu
@@ -1,7 +1,8 @@
 /**
  * Copyright 2021-2024, XGBoost Contributors
  */
-#include <thrust/copy.h>  // for copy
+#include <thrust/copy.h>     // for copy
+#include <thrust/logical.h>  // for any_of
 #include <thrust/scan.h>
 #include <thrust/sort.h>
diff --git a/src/tree/common_row_partitioner.h b/src/tree/common_row_partitioner.h
index cd267673b66c..ff75000dfd45 100644
--- a/src/tree/common_row_partitioner.h
+++ b/src/tree/common_row_partitioner.h
@@ -36,10 +36,11 @@ class ColumnSplitHelper {
                     common::PartitionBuilder<kPartitionBlockSize>* partition_builder,
                     common::RowSetCollection* row_set_collection)
       : partition_builder_{partition_builder}, row_set_collection_{row_set_collection} {
-    decision_storage_.resize(num_row);
-    decision_bits_ = BitVector(common::Span(decision_storage_));
-    missing_storage_.resize(num_row);
-    missing_bits_ = BitVector(common::Span(missing_storage_));
+    auto n_bytes = BitVector::ComputeStorageSize(num_row);
+    decision_storage_.resize(n_bytes);
+    decision_bits_ = BitVector{common::Span{decision_storage_}};
+    missing_storage_.resize(n_bytes);
+    missing_bits_ = BitVector{common::Span{missing_storage_}};
   }
 
   template <typename ExpandEntry>
@@ -51,14 +52,43 @@ class ColumnSplitHelper {
     // we first collect all the decisions and whether the feature is missing into bit vectors.
     std::fill(decision_storage_.begin(), decision_storage_.end(), 0);
     std::fill(missing_storage_.begin(), missing_storage_.end(), 0);
-    common::ParallelFor2d(space, n_threads, [&](size_t node_in_set, common::Range1d r) {
-      const int32_t nid = nodes[node_in_set].nid;
+
+    this->tloc_decision_.resize(decision_storage_.size() * n_threads);
+    this->tloc_missing_.resize(decision_storage_.size() * n_threads);
+    std::fill_n(this->tloc_decision_.data(), this->tloc_decision_.size(), 0);
+    std::fill_n(this->tloc_missing_.data(), this->tloc_missing_.size(), 0);
+
+    // Make thread-local storage.
+    using T = decltype(decision_storage_)::value_type;
+    auto make_tloc = [&](std::vector<T>& storage, std::int32_t tidx) {
+      auto span = common::Span{storage};
+      auto n = decision_storage_.size();
+      auto bitvec = BitVector{span.subspan(n * tidx, n)};
+      return bitvec;
+    };
+
+    common::ParallelFor2d(space, n_threads, [&](std::size_t node_in_set, common::Range1d r) {
+      bst_node_t const nid = nodes[node_in_set].nid;
+      auto tidx = omp_get_thread_num();
+      auto decision = make_tloc(this->tloc_decision_, tidx);
+      auto missing = make_tloc(this->tloc_missing_, tidx);
       bst_bin_t split_cond = column_matrix.IsInitialized() ? split_conditions[node_in_set] : 0;
       partition_builder_->MaskRows(
           node_in_set, nodes, r, split_cond, gmat, column_matrix, *p_tree,
-          (*row_set_collection_)[nid].begin(), &decision_bits_, &missing_bits_);
+          (*row_set_collection_)[nid].begin(), &decision, &missing);
     });
+
+    // Reduce thread local
+    auto decision = make_tloc(this->tloc_decision_, 0);
+    auto missing = make_tloc(this->tloc_missing_, 0);
+    for (std::int32_t tidx = 1; tidx < n_threads; ++tidx) {
+      decision |= make_tloc(this->tloc_decision_, tidx);
+      missing |= make_tloc(this->tloc_missing_, tidx);
+    }
+    CHECK_EQ(decision_storage_.size(), decision.NumValues());
+    std::copy_n(decision.Data(), decision_storage_.size(), decision_storage_.data());
+    std::copy_n(missing.Data(), missing_storage_.size(), missing_storage_.data());
+
     // Then aggregate the bit vectors across all the workers.
     auto rc = collective::Success() << [&] {
       return collective::Allreduce(ctx, &decision_storage_, collective::Op::kBitwiseOR);
@@ -85,6 +115,10 @@ class ColumnSplitHelper {
   BitVector decision_bits_{};
   std::vector<BitVector::value_type> missing_storage_{};
   BitVector missing_bits_{};
+
+  std::vector<BitVector::value_type> tloc_decision_;
+  std::vector<BitVector::value_type> tloc_missing_;
+
   common::PartitionBuilder<kPartitionBlockSize>* partition_builder_;
   common::RowSetCollection* row_set_collection_;
 };
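The partitioner change above addresses two distinct issues. First, the storage was previously sized as one storage word per row, allocating (and all-reducing) word-size times more bits than there are rows; `BitVector::ComputeStorageSize()` instead rounds the row count up to whole words. Second, letting every OpenMP thread set bits in one shared vector can lose updates when two rows land in the same word, so each thread now fills its own slice, which is OR-reduced afterwards. A sketch of the rounding arithmetic (the real helper presumably lives with the bit-field types in `src/common/bitfield.h`; the 32-bit word here is an assumption for illustration):

```cpp
#include <cstddef>
#include <cstdint>
#include <iostream>

constexpr std::size_t kWordBits = sizeof(std::uint32_t) * 8;  // 32 bits per word

constexpr std::size_t ComputeStorageSize(std::size_t n_bits) {
  return (n_bits + kWordBits - 1) / kWordBits;  // round up to whole words
}

int main() {
  // The old code resized the storage to `num_row` entries outright: for 1000
  // rows that is 1000 words, where 32 words already hold 1000 decision bits.
  std::cout << ComputeStorageSize(1000) << "\n";  // 32
}
```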
diff --git a/src/tree/updater_gpu_hist.cu b/src/tree/updater_gpu_hist.cu
index 366cf3aad08e..5278b328acbc 100644
--- a/src/tree/updater_gpu_hist.cu
+++ b/src/tree/updater_gpu_hist.cu
@@ -841,9 +841,7 @@ class GPUHistMaker : public TreeUpdater {
     out["hist_train_param"] = ToJson(hist_maker_param_);
   }
 
-  ~GPUHistMaker() {  // NOLINT
-    dh::GlobalMemoryLogger().Log();
-  }
+  ~GPUHistMaker() override { dh::GlobalMemoryLogger().Log(); }
 
   void Update(TrainParam const* param, linalg::Matrix<GradientPair>* gpair, DMatrix* dmat,
               common::Span<HostDeviceVector<bst_node_t>> out_position,
diff --git a/tests/ci_build/conda_env/linux_sycl_test.yml b/tests/ci_build/conda_env/linux_sycl_test.yml
index 7335b7f20fd5..e82a6bed62f5 100644
--- a/tests/ci_build/conda_env/linux_sycl_test.yml
+++ b/tests/ci_build/conda_env/linux_sycl_test.yml
@@ -1,7 +1,7 @@
 name: linux_sycl_test
 channels:
 - conda-forge
-- intel
+- https://software.repos.intel.com/python/conda/
 dependencies:
 - python=3.8
 - cmake
diff --git a/tests/cpp/common/test_device_vector.cu b/tests/cpp/common/test_device_vector.cu
new file mode 100644
index 000000000000..95da4ef3f167
--- /dev/null
+++ b/tests/cpp/common/test_device_vector.cu
@@ -0,0 +1,21 @@
+/**
+ * Copyright 2024, XGBoost Contributors
+ */
+#include <gtest/gtest.h>
+
+#include "../../../src/common/device_vector.cuh"
+#include "xgboost/global_config.h"  // for GlobalConfigThreadLocalStore
+
+namespace dh {
+TEST(DeviceUVector, Basic) {
+  GlobalMemoryLogger().Clear();
+  std::int32_t verbosity{3};
+  std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
+  DeviceUVector<float> uvec;
+  uvec.Resize(12);
+  auto peak = GlobalMemoryLogger().PeakMemory();
+  auto n_bytes = sizeof(decltype(uvec)::value_type) * uvec.size();
+  ASSERT_EQ(peak, n_bytes);
+  std::swap(verbosity, xgboost::GlobalConfigThreadLocalStore::Get()->verbosity);
+}
+}  // namespace dh
diff --git a/tests/cpp/common/test_host_device_vector.cu b/tests/cpp/common/test_host_device_vector.cu
index 57e945cba9be..a0aa5fa11fce 100644
--- a/tests/cpp/common/test_host_device_vector.cu
+++ b/tests/cpp/common/test_host_device_vector.cu
@@ -1,5 +1,5 @@
 /**
- * Copyright 2018-2023 XGBoost contributors
+ * Copyright 2018-2024, XGBoost contributors
  */
 #include <gtest/gtest.h>
 #include <thrust/device_vector.h>
@@ -181,4 +181,41 @@ TEST(HostDeviceVector, Empty) {
   ASSERT_FALSE(another.Empty());
   ASSERT_TRUE(vec.Empty());
 }
+
+TEST(HostDeviceVector, Resize) {
+  auto check = [&](HostDeviceVector<float> const& vec) {
+    auto const& h_vec = vec.ConstHostSpan();
+    for (std::size_t i = 0; i < 4; ++i) {
+      ASSERT_EQ(h_vec[i], i + 1);
+    }
+    for (std::size_t i = 4; i < vec.Size(); ++i) {
+      ASSERT_EQ(h_vec[i], 3.0);
+    }
+  };
+  {
+    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+    vec.SetDevice(DeviceOrd::CUDA(0));
+    vec.ConstDeviceSpan();
+    ASSERT_TRUE(vec.DeviceCanRead());
+    ASSERT_FALSE(vec.DeviceCanWrite());
+    vec.DeviceSpan();
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    check(vec);
+  }
+  {
+    HostDeviceVector<float> vec{{1.0f, 2.0f, 3.0f, 4.0f}, DeviceOrd::CUDA(0)};
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.DeviceCanWrite());
+    check(vec);
+  }
+  {
+    HostDeviceVector<float> vec{1.0f, 2.0f, 3.0f, 4.0f};
+    ASSERT_TRUE(vec.HostCanWrite());
+    vec.Resize(7, 3.0f);
+    ASSERT_TRUE(vec.HostCanWrite());
+    check(vec);
+  }
+}
 }  // namespace xgboost::common
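The new `Resize` test leans on `HostDeviceVector`'s access states: taking a const device span grants read access, taking a mutable span (or doing an on-device resize) grants write access, and write access on one side invalidates the other side's copy. A tiny sketch of that lattice, with an illustrative enum (the real one is declared in `xgboost/host_device_vector.h`):

```cpp
#include <iostream>

enum class GPUAccess { kNone, kRead, kWrite };

// The device can read with at least kRead; it can write only with kWrite.
bool DeviceCanRead(GPUAccess a) { return a != GPUAccess::kNone; }
bool DeviceCanWrite(GPUAccess a) { return a == GPUAccess::kWrite; }

int main() {
  auto a = GPUAccess::kRead;  // state after ConstDeviceSpan()
  std::cout << DeviceCanRead(a) << DeviceCanWrite(a) << "\n";  // 10
  a = GPUAccess::kWrite;      // state after DeviceSpan() or an on-device Resize()
  std::cout << DeviceCanRead(a) << DeviceCanWrite(a) << "\n";  // 11
}
```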
diff --git a/tests/cpp/data/test_array_interface.h b/tests/cpp/data/test_array_interface.h
index 78bce76f53e7..dfe4f5a3ec5c 100644
--- a/tests/cpp/data/test_array_interface.h
+++ b/tests/cpp/data/test_array_interface.h
@@ -1,15 +1,14 @@
-// Copyright (c) 2019 by Contributors
+/**
+ * Copyright 2019-2024, XGBoost Contributors
+ */
 #include <gtest/gtest.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>  // for device
+#include <thrust/sequence.h>          // for sequence
 #include <xgboost/host_device_vector.h>
 #include <xgboost/json.h>
-#include <thrust/device_vector.h>
-
-#include <vector>
-#include "../../../src/common/bitfield.h"
-#include "../../../src/common/device_helpers.cuh"
 
 namespace xgboost {
-
 template <typename T>
 Json GenerateDenseColumn(std::string const& typestr, size_t kRows,
                          thrust::device_vector<T>* out_d_data) {
diff --git a/tests/cpp/tree/test_approx.cc b/tests/cpp/tree/test_approx.cc
index d647d3a970bf..8f28bfa218c8 100644
--- a/tests/cpp/tree/test_approx.cc
+++ b/tests/cpp/tree/test_approx.cc
@@ -6,6 +6,7 @@
 #include "../../../src/tree/common_row_partitioner.h"
 #include "../collective/test_worker.h"  // for TestDistributedGlobal
 #include "../helpers.h"
+#include "test_column_split.h"  // for TestColumnSplit
 #include "test_partitioner.h"
 
 namespace xgboost::tree {
@@ -154,4 +155,26 @@ TEST(Approx, PartitionerColSplit) {
                            mid_partitioner);
   });
 }
+
+namespace {
+class TestApproxColSplit : public ::testing::TestWithParam<std::tuple<bool, float>> {
+ public:
+  void Run() {
+    auto [categorical, sparsity] = GetParam();
+    TestColumnSplit(1u, categorical, "grow_histmaker", sparsity);
+  }
+};
+}  // namespace
+
+TEST_P(TestApproxColSplit, Basic) { this->Run(); }
+
+INSTANTIATE_TEST_SUITE_P(ColumnSplit, TestApproxColSplit, ::testing::ValuesIn([]() {
+                           std::vector<std::tuple<bool, float>> params;
+                           for (auto categorical : {true, false}) {
+                             for (auto sparsity : {0.0f, 0.6f}) {
+                               params.emplace_back(categorical, sparsity);
+                             }
+                           }
+                           return params;
+                         }()));
 }  // namespace xgboost::tree
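The parameterized suites in this patch (here and in `test_quantile_hist.cc` below) build their parameter grids with an immediately-invoked lambda inside `::testing::ValuesIn`, which keeps the cartesian product next to the instantiation. A reduced example of the same pattern:

```cpp
#include <iostream>
#include <tuple>
#include <vector>

int main() {
  auto params = []() {
    std::vector<std::tuple<bool, float>> out;
    for (bool categorical : {true, false}) {
      for (float sparsity : {0.0f, 0.6f}) {
        out.emplace_back(categorical, sparsity);
      }
    }
    return out;
  }();  // immediately invoked: the grid exists before the test suite is instantiated
  std::cout << params.size() << "\n";  // 4
}
```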
diff --git a/tests/cpp/tree/test_column_split.h b/tests/cpp/tree/test_column_split.h
new file mode 100644
index 000000000000..b03597f38681
--- /dev/null
+++ b/tests/cpp/tree/test_column_split.h
@@ -0,0 +1,79 @@
+/**
+ * Copyright 2023-2024, XGBoost Contributors
+ */
+#pragma once
+
+#include <xgboost/data.h>          // for FeatureType, DMatrix
+#include <xgboost/tree_model.h>    // for RegTree
+#include <xgboost/tree_updater.h>  // for TreeUpdater
+
+#include <cstddef>  // for size_t
+#include <memory>   // for shared_ptr
+#include <vector>   // for vector
+
+#include "../../../src/tree/param.h"    // for TrainParam
+#include "../collective/test_worker.h"  // for TestDistributedGlobal
+#include "../helpers.h"                 // for RandomDataGenerator
+
+namespace xgboost::tree {
+inline std::shared_ptr<DMatrix> GenerateCatDMatrix(std::size_t rows, std::size_t cols,
+                                                   float sparsity, bool categorical) {
+  if (categorical) {
+    std::vector<FeatureType> ft(cols);
+    for (size_t i = 0; i < ft.size(); ++i) {
+      ft[i] = (i % 3 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
+    }
+    return RandomDataGenerator(rows, cols, sparsity).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix();
+  } else {
+    return RandomDataGenerator{rows, cols, sparsity}.Seed(3).GenerateDMatrix();
+  }
+}
+
+inline void TestColumnSplit(bst_target_t n_targets, bool categorical, std::string name,
+                            float sparsity) {
+  auto constexpr kRows = 32;
+  auto constexpr kCols = 16;
+
+  RegTree expected_tree{n_targets, static_cast<bst_feature_t>(kCols)};
+  ObjInfo task{ObjInfo::kRegression};
+  Context ctx;
+  {
+    auto p_dmat = GenerateCatDMatrix(kRows, kCols, sparsity, categorical);
+    auto gpair = GenerateRandomGradients(&ctx, kRows, n_targets);
+    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(name, &ctx, &task)};
+    std::vector<HostDeviceVector<bst_node_t>> position(1);
+    TrainParam param;
+    param.Init(Args{});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, p_dmat.get(), position, {&expected_tree});
+  }
+
+  auto verify = [&] {
+    Context ctx;
+    auto p_dmat = GenerateCatDMatrix(kRows, kCols, sparsity, categorical);
+    auto gpair = GenerateRandomGradients(&ctx, kRows, n_targets);
+
+    ObjInfo task{ObjInfo::kRegression};
+    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create(name, &ctx, &task)};
+    std::vector<HostDeviceVector<bst_node_t>> position(1);
+
+    std::unique_ptr<DMatrix> sliced{
+        p_dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
+
+    RegTree tree{n_targets, static_cast<bst_feature_t>(kCols)};
+    TrainParam param;
+    param.Init(Args{});
+    updater->Configure(Args{});
+    updater->Update(&param, &gpair, sliced.get(), position, {&tree});
+
+    Json json{Object{}};
+    tree.SaveModel(&json);
+    Json expected_json{Object{}};
+    expected_tree.SaveModel(&expected_json);
+    ASSERT_EQ(json, expected_json);
+  };
+
+  auto constexpr kWorldSize = 2;
+  collective::TestDistributedGlobal(kWorldSize, [&] { verify(); });
+}
+}  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_histmaker.cc b/tests/cpp/tree/test_histmaker.cc
index b8b9e46cac18..888790aa7c3c 100644
--- a/tests/cpp/tree/test_histmaker.cc
+++ b/tests/cpp/tree/test_histmaker.cc
@@ -1,32 +1,19 @@
 /**
- * Copyright 2019-2023 by XGBoost Contributors
+ * Copyright 2019-2024, XGBoost Contributors
  */
 #include <gtest/gtest.h>
 #include <xgboost/tree_model.h>
 #include <xgboost/tree_updater.h>
 
-#include "../../../src/tree/param.h"    // for TrainParam
-#include "../collective/test_worker.h"  // for TestDistributedGlobal
+#include "../../../src/tree/param.h"  // for TrainParam
 #include "../helpers.h"
+#include "test_column_split.h"  // for GenerateCatDMatrix
 
 namespace xgboost::tree {
-std::shared_ptr<DMatrix> GenerateDMatrix(std::size_t rows, std::size_t cols,
-                                         bool categorical = false) {
-  if (categorical) {
-    std::vector<FeatureType> ft(cols);
-    for (size_t i = 0; i < ft.size(); ++i) {
-      ft[i] = (i % 3 == 0) ? FeatureType::kNumerical : FeatureType::kCategorical;
-    }
-    return RandomDataGenerator(rows, cols, 0.6f).Seed(3).Type(ft).MaxCategory(17).GenerateDMatrix();
-  } else {
-    return RandomDataGenerator{rows, cols, 0.6f}.Seed(3).GenerateDMatrix();
-  }
-}
-
 TEST(GrowHistMaker, InteractionConstraint) {
   auto constexpr kRows = 32;
   auto constexpr kCols = 16;
-  auto p_dmat = GenerateDMatrix(kRows, kCols);
+  auto p_dmat = GenerateCatDMatrix(kRows, kCols, 0.0, false);
 
   Context ctx;
   linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
@@ -69,62 +56,4 @@ TEST(GrowHistMaker, InteractionConstraint) {
     ASSERT_NE(tree[tree[0].RightChild()].SplitIndex(), 0);
   }
 }
-
-namespace {
-void VerifyColumnSplit(int32_t rows, bst_feature_t cols, bool categorical,
-                       RegTree const& expected_tree) {
-  Context ctx;
-  auto p_dmat = GenerateDMatrix(rows, cols, categorical);
-  linalg::Matrix<GradientPair> gpair({rows}, ctx.Device());
-  gpair.Data()->Copy(GenerateRandomGradients(rows));
-
-
-  ObjInfo task{ObjInfo::kRegression};
-  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
-  std::vector<HostDeviceVector<bst_node_t>> position(1);
-
-  std::unique_ptr<DMatrix> sliced{
-      p_dmat->SliceCol(collective::GetWorldSize(), collective::GetRank())};
-
-  RegTree tree{1u, cols};
-  TrainParam param;
-  param.Init(Args{});
-  updater->Configure(Args{});
-  updater->Update(&param, &gpair, sliced.get(), position, {&tree});
-
-  Json json{Object{}};
-  tree.SaveModel(&json);
-  Json expected_json{Object{}};
-  expected_tree.SaveModel(&expected_json);
-  ASSERT_EQ(json, expected_json);
-}
-
-void TestColumnSplit(bool categorical) {
-  auto constexpr kRows = 32;
-  auto constexpr kCols = 16;
-
-  RegTree expected_tree{1u, kCols};
-  ObjInfo task{ObjInfo::kRegression};
-  {
-    Context ctx;
-    auto p_dmat = GenerateDMatrix(kRows, kCols, categorical);
-    linalg::Matrix<GradientPair> gpair({kRows}, ctx.Device());
-    gpair.Data()->Copy(GenerateRandomGradients(kRows));
-    std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_histmaker", &ctx, &task)};
-    std::vector<HostDeviceVector<bst_node_t>> position(1);
-    TrainParam param;
-    param.Init(Args{});
-    updater->Configure(Args{});
-    updater->Update(&param, &gpair, p_dmat.get(), position, {&expected_tree});
-  }
-
-  auto constexpr kWorldSize = 2;
-  collective::TestDistributedGlobal(
-      kWorldSize, [&] { VerifyColumnSplit(kRows, kCols, categorical, expected_tree); });
-}
-}  // anonymous namespace
-
-TEST(GrowHistMaker, ColumnSplitNumerical) { TestColumnSplit(false); }
-
-TEST(GrowHistMaker, ColumnSplitCategorical) { TestColumnSplit(true); }
 }  // namespace xgboost::tree
diff --git a/tests/cpp/tree/test_quantile_hist.cc b/tests/cpp/tree/test_quantile_hist.cc
index 29ae02f8d2b2..74fd6ec5ff79 100644
--- a/tests/cpp/tree/test_quantile_hist.cc
+++ b/tests/cpp/tree/test_quantile_hist.cc
@@ -11,9 +11,9 @@
 #include "../../../src/tree/common_row_partitioner.h"
 #include "../../../src/tree/hist/expand_entry.h"  // for MultiExpandEntry, CPUExpandEntry
-#include "../../../src/tree/param.h"
 #include "../collective/test_worker.h"  // for TestDistributedGlobal
 #include "../helpers.h"
+#include "test_column_split.h"  // for TestColumnSplit
 #include "test_partitioner.h"
 #include "xgboost/data.h"
 
@@ -208,57 +208,26 @@ TEST(QuantileHist, PartitionerColSplit) { TestColumnSplitPartitioner(3); }
 
 namespace {
-void VerifyColumnSplit(Context const* ctx, bst_idx_t rows, bst_feature_t cols, bst_target_t n_targets,
-                       RegTree const& expected_tree) {
-  auto Xy = RandomDataGenerator{rows, cols, 0}.GenerateDMatrix(true);
-  linalg::Matrix<GradientPair> gpair = GenerateRandomGradients(ctx, rows, n_targets);
-
-  ObjInfo task{ObjInfo::kRegression};
-  std::unique_ptr<TreeUpdater> updater{TreeUpdater::Create("grow_quantile_histmaker", ctx, &task)};
-  std::vector<HostDeviceVector<bst_node_t>> position(1);
-
-  std::unique_ptr<DMatrix> sliced{Xy->SliceCol(collective::GetWorldSize(), collective::GetRank())};
-
-  RegTree tree{n_targets, cols};
-  TrainParam param;
-  param.Init(Args{});
-  updater->Configure(Args{});
-  updater->Update(&param, &gpair, sliced.get(), position, {&tree});
-
-  Json json{Object{}};
-  tree.SaveModel(&json);
-  Json expected_json{Object{}};
-  expected_tree.SaveModel(&expected_json);
-  ASSERT_EQ(json, expected_json);
-}
-
-void TestColumnSplit(bst_target_t n_targets) {
-  auto constexpr kRows = 32;
-  auto constexpr kCols = 16;
-
-  RegTree expected_tree{n_targets, kCols};
-  ObjInfo task{ObjInfo::kRegression};
-  Context ctx;
-  {
-    auto Xy = RandomDataGenerator{kRows, kCols, 0}.GenerateDMatrix(true);
-    auto gpair = GenerateRandomGradients(&ctx, kRows, n_targets);
-    std::unique_ptr<TreeUpdater> updater{
-        TreeUpdater::Create("grow_quantile_histmaker", &ctx, &task)};
-    std::vector<HostDeviceVector<bst_node_t>> position(1);
-    TrainParam param;
-    param.Init(Args{});
-    updater->Configure(Args{});
-    updater->Update(&param, &gpair, Xy.get(), position, {&expected_tree});
+class TestHistColSplit : public ::testing::TestWithParam<std::tuple<bst_target_t, bool, float>> {
+ public:
+  void Run() {
+    auto [n_targets, categorical, sparsity] = GetParam();
+    TestColumnSplit(n_targets, categorical, "grow_quantile_histmaker", sparsity);
   }
-
-  auto constexpr kWorldSize = 2;
-  collective::TestDistributedGlobal(kWorldSize, [&] {
-    VerifyColumnSplit(&ctx, kRows, kCols, n_targets, std::cref(expected_tree));
-  });
-}
+};
 }  // anonymous namespace
 
-TEST(QuantileHist, ColumnSplit) { TestColumnSplit(1); }
-
-TEST(QuantileHist, ColumnSplitMultiTarget) { TestColumnSplit(3); }
+TEST_P(TestHistColSplit, Basic) { this->Run(); }
+
+INSTANTIATE_TEST_SUITE_P(ColumnSplit, TestHistColSplit, ::testing::ValuesIn([]() {
+                           std::vector<std::tuple<bst_target_t, bool, float>> params;
+                           for (auto categorical : {true, false}) {
+                             for (auto sparsity : {0.0f, 0.6f}) {
+                               for (bst_target_t n_targets : {1u, 3u}) {
+                                 params.emplace_back(n_targets, categorical, sparsity);
+                               }
+                             }
+                           }
+                           return params;
+                         }()));
 }  // namespace xgboost::tree