Commit

Merge branch 'altrep' into remove_ntree
david-cortes committed Jan 3, 2024
2 parents e57a92f + 1a3d9f7 commit 80ce18b
Showing 45 changed files with 616 additions and 179 deletions.
5 changes: 5 additions & 0 deletions R-package/NAMESPACE
@@ -40,6 +40,9 @@ export(xgb.cv)
export(xgb.dump)
export(xgb.gblinear.history)
export(xgb.get.Booster.nrounds)
export(xgb.get.DMatrix.data)
export(xgb.get.DMatrix.num.non.missing)
export(xgb.get.DMatrix.qcut)
export(xgb.get.config)
export(xgb.ggplot.deepness)
export(xgb.ggplot.importance)
@@ -64,6 +67,7 @@ export(xgb.unserialize)
export(xgboost)
import(methods)
importClassesFrom(Matrix,dgCMatrix)
importClassesFrom(Matrix,dgRMatrix)
importClassesFrom(Matrix,dgeMatrix)
importFrom(Matrix,colSums)
importFrom(Matrix,sparse.model.matrix)
@@ -87,6 +91,7 @@ importFrom(graphics,points)
importFrom(graphics,title)
importFrom(jsonlite,fromJSON)
importFrom(jsonlite,toJSON)
importFrom(methods,new)
importFrom(stats,coef)
importFrom(stats,median)
importFrom(stats,predict)
3 changes: 2 additions & 1 deletion R-package/R/callbacks.R
@@ -771,7 +771,8 @@ xgb.gblinear.history <- function(model, class_index = NULL) {
if (!is.null(eval_err)) {
if (length(eval_res) != length(eval_err))
stop('eval_res & eval_err lengths mismatch')
res <- paste0(sprintf("%s:%f+%f", enames, eval_res, eval_err), collapse = '\t')
# Note: UTF-8 code for plus/minus sign is U+00B1
res <- paste0(sprintf("%s:%f\U00B1%f", enames, eval_res, eval_err), collapse = '\t')
} else {
res <- paste0(sprintf("%s:%f", enames, eval_res), collapse = '\t')
}
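
For reference, a small illustrative check (not part of the diff) of what the new plus/minus formatting produces; the metric names and values below are made up:

# Hypothetical evaluation results, just to show the \U00B1 formatting
enames <- c("train-auc", "test-auc")
eval_res <- c(0.987, 0.954)
eval_err <- c(0.002, 0.011)
paste0(sprintf("%s:%f\U00B1%f", enames, eval_res, eval_err), collapse = "\t")
# "train-auc:0.987000±0.002000\ttest-auc:0.954000±0.011000"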
5 changes: 3 additions & 2 deletions R-package/R/utils.R
@@ -388,8 +388,9 @@ NULL
#'
#' @examples
#' data(agaricus.train, package='xgboost')
#' bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2, objective = "binary:logistic")
#' bst <- xgb.train(data = xgb.DMatrix(agaricus.train$data, label = agaricus.train$label),
#' max_depth = 2, eta = 1, nthread = 2, nrounds = 2,
#' objective = "binary:logistic")
#'
#' # Save as a stand-alone file; load it with xgb.load()
#' xgb.save(bst, 'xgb.model')
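
The documentation examples in this commit repeatedly apply the same migration: data and label that were previously passed straight to `xgboost()` are now wrapped in an `xgb.DMatrix` and passed to `xgb.train()`. A minimal sketch of the pattern (parameter values are illustrative, not taken from the diff):

library(xgboost)
data(agaricus.train, package = "xgboost")

# Old style: matrix and label passed directly
# bst <- xgboost(data = agaricus.train$data, label = agaricus.train$label,
#                nrounds = 2, objective = "binary:logistic")

# New style: build the DMatrix explicitly, then call xgb.train()
dtrain <- xgb.DMatrix(agaricus.train$data, label = agaricus.train$label)
bst <- xgb.train(
  data = dtrain,
  nrounds = 2,
  params = list(objective = "binary:logistic", nthread = 2)
)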
15 changes: 6 additions & 9 deletions R-package/R/xgb.Booster.R
@@ -185,9 +185,8 @@ xgb.get.handle <- function(object) {
#' train <- agaricus.train
#' test <- agaricus.test
#'
#' bst <- xgboost(
#' data = train$data,
#' label = train$label,
#' bst <- xgb.train(
#' data = xgb.DMatrix(train$data, label = train$label),
#' max_depth = 2,
#' eta = 0.5,
#' nthread = nthread,
@@ -229,9 +228,8 @@ xgb.get.handle <- function(object) {
#'
#' set.seed(11)
#'
#' bst <- xgboost(
#' data = as.matrix(iris[, -5]),
#' label = lb,
#' bst <- xgb.train(
#' data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
#' max_depth = 4,
#' eta = 0.5,
#' nthread = 2,
@@ -254,9 +252,8 @@
#' # compare with predictions from softmax:
#' set.seed(11)
#'
#' bst <- xgboost(
#' data = as.matrix(iris[, -5]),
#' label = lb,
#' bst <- xgb.train(
#' data = xgb.DMatrix(as.matrix(iris[, -5]), label = lb),
#' max_depth = 4,
#' eta = 0.5,
#' nthread = 2,
105 changes: 105 additions & 0 deletions R-package/R/xgb.DMatrix.R
@@ -531,6 +531,111 @@ setinfo.xgb.DMatrix <- function(object, name, info) {
  stop("setinfo: unknown info name ", name)
}

#' @title Get Quantile Cuts from DMatrix
#' @description Get the quantile cuts (a.k.a. borders) from an `xgb.DMatrix`
#' that has been quantized for the histogram method (`tree_method="hist"`).
#'
#' These cuts are used to assign observations to bins - i.e. they are ordered boundaries
#' defining the assignment condition `border_low < x < border_high`. As such, the lowest
#' and highest borders lie outside the range of the data, so that every observation is
#' assigned to some bin.
#'
#' If a given column has 'n' bins, then there will be 'n+1' cuts / borders for that column,
#' which will be output in sorted order from lowest to highest.
#'
#' Different columns can have different numbers of bins according to their range.
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @param output Output format for the quantile cuts. Possible options are:\itemize{
#' \item `"list"` will return the output as a list with one entry per column, where
#' each column will have a numeric vector with the cuts. The list will be named if
#' `dmat` has column names assigned to it.
#' \item `"arrays"` will return a list with entries `indptr` (base-0 indexing) and
#' `data`. Here, the cuts for column 'i' are obtained by slicing 'data' from entries
#' `indptr[i]+1` to `indptr[i+1]`.
#' }
#' @return The quantile cuts, in the format specified by parameter `output`.
#' @examples
#' library(xgboost)
#' data(mtcars)
#' y <- mtcars$mpg
#' x <- as.matrix(mtcars[, -1])
#' dm <- xgb.DMatrix(x, label = y, nthread = 1)
#'
#' # DMatrix is not quantized right away, but will be once a hist model is generated
#' model <- xgb.train(
#'   data = dm,
#'   params = list(
#'     tree_method = "hist",
#'     max_bin = 8,
#'     nthread = 1
#'   ),
#'   nrounds = 3
#' )
#'
#' # Now can get the quantile cuts
#' xgb.get.DMatrix.qcut(dm)
#' @export
xgb.get.DMatrix.qcut <- function(dmat, output = c("list", "arrays")) { # nolint
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  output <- head(output, 1L)
  stopifnot(output %in% c("list", "arrays"))
  res <- .Call(XGDMatrixGetQuantileCut_R, dmat)
  if (output == "arrays") {
    return(res)
  } else {
    feature_names <- getinfo(dmat, "feature_name")
    ncols <- length(res$indptr) - 1
    out <- lapply(
      seq(1, ncols),
      function(col) {
        st <- res$indptr[col]
        end <- res$indptr[col + 1]
        if (end <= st) {
          return(numeric())
        }
        return(res$data[seq(1 + st, end)])
      }
    )
    if (NROW(feature_names)) {
      names(out) <- feature_names
    }
    return(out)
  }
}
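
An illustrative usage sketch (not part of the diff) showing how the `"arrays"` output maps onto the `"list"` output, assuming `dm` has already been quantized by the hist-method training shown in the example above; the column index is chosen arbitrarily:

qc <- xgb.get.DMatrix.qcut(dm, output = "arrays")
i <- 1L                               # first column, chosen arbitrarily
st <- qc$indptr[i]                    # base-0 start offset for column i
end <- qc$indptr[i + 1L]              # base-0 end offset for column i
cuts_i <- qc$data[seq(st + 1L, end)]  # same slicing rule as documented above
# Should match the corresponding entry of the "list" output
identical(cuts_i, xgb.get.DMatrix.qcut(dm, output = "list")[[i]])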

#' @title Get Number of Non-Missing Entries in DMatrix
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @return The number of non-missing entries in the DMatrix
#' @export
xgb.get.DMatrix.num.non.missing <- function(dmat) { # nolint
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  return(.Call(XGDMatrixNumNonMissing_R, dmat))
}
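
A quick illustrative check (not part of the diff): for a dense input with no missing values, such as the `mtcars` matrix `x` from the example above, the count should simply equal the number of cells:

xgb.get.DMatrix.num.non.missing(dm) == prod(dim(x))  # expected TRUE for dense, NA-free data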

#' @title Get DMatrix Data
#' @param dmat An `xgb.DMatrix` object, as returned by \link{xgb.DMatrix}.
#' @return The data held in the DMatrix, as a sparse CSR matrix (class `dgRMatrix`
#' from package `Matrix`). If it had feature names, these will be added as column names
#' in the output.
#' @export
xgb.get.DMatrix.data <- function(dmat) {
  stopifnot(inherits(dmat, "xgb.DMatrix"))
  res <- .Call(XGDMatrixGetDataAsCSR_R, dmat)
  out <- methods::new("dgRMatrix")
  nrows <- as.integer(length(res$indptr) - 1)
  out@p <- res$indptr
  out@j <- res$indices
  out@x <- res$data
  out@Dim <- as.integer(c(nrows, res$ncols))

  feature_names <- getinfo(dmat, "feature_name")
  dim_names <- list(NULL, NULL)
  if (NROW(feature_names)) {
    dim_names[[2L]] <- feature_names
  }
  out@Dimnames <- dim_names
  return(out)
}
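
An illustrative round-trip sketch (not part of the diff), again reusing `x` and `dm` from the example above; the data comes back as a CSR matrix whose dense form should match the original values:

csr <- xgb.get.DMatrix.data(dm)
inherits(csr, "dgRMatrix")    # TRUE
dim(csr)                      # same shape as 'x'
colnames(csr)                 # feature names carried over from 'x', if present
all.equal(as.matrix(csr), x, check.attributes = FALSE)  # values should round-trip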

#' Get a new DMatrix containing the specified rows of
#' original xgb.DMatrix object
5 changes: 3 additions & 2 deletions R-package/R/xgb.cv.R
@@ -243,8 +243,9 @@ xgb.cv <- function(params = list(), data, nrounds, nfold, label = NULL, missing
)
})
msg <- simplify2array(msg)
bst_evaluation <- rowMeans(msg)
bst_evaluation_err <- sqrt(rowMeans(msg^2) - bst_evaluation^2) # nolint
# Note: these variables might look unused here, but they are used in the callbacks
bst_evaluation <- rowMeans(msg) # nolint
bst_evaluation_err <- apply(msg, 1, sd) # nolint

for (f in cb$post_iter) f()

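
The replaced formula computed the population standard deviation (`sqrt(E[x^2] - E[x]^2)`), while `apply(msg, 1, sd)` computes the sample standard deviation (denominator `n - 1`); for `n` folds the two differ by a factor of `sqrt(n / (n - 1))`. A tiny illustrative check (not part of the diff) with made-up per-fold results:

msg <- rbind(`test-auc` = c(0.91, 0.94, 0.93))   # one metric, three folds (made up)
old_err <- sqrt(rowMeans(msg^2) - rowMeans(msg)^2)  # population sd, as before
new_err <- apply(msg, 1, sd)                        # sample sd, as in the new code
all.equal(unname(new_err), unname(old_err) * sqrt(3 / 2))  # TRUE up to rounding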
6 changes: 4 additions & 2 deletions R-package/R/xgb.load.R
@@ -29,8 +29,10 @@
#'
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(
#' data = train$data, label = train$label, max_depth = 2, eta = 1,
#' bst <- xgb.train(
#' data = xgb.DMatrix(train$data, label = train$label),
#' max_depth = 2,
#' eta = 1,
#' nthread = nthread,
#' nrounds = 2,
#' objective = "binary:logistic"
6 changes: 4 additions & 2 deletions R-package/R/xgb.save.R
@@ -32,8 +32,10 @@
#'
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(
#' data = train$data, label = train$label, max_depth = 2, eta = 1,
#' bst <- xgb.train(
#' data = xgb.DMatrix(train$data, label = train$label),
#' max_depth = 2,
#' eta = 1,
#' nthread = nthread,
#' nrounds = 2,
#' objective = "binary:logistic"
4 changes: 2 additions & 2 deletions R-package/R/xgb.save.raw.R
@@ -21,8 +21,8 @@
#'
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
#' eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
#' eta = 1, nthread = nthread, nrounds = 2,objective = "binary:logistic")
#'
#' raw <- xgb.save.raw(bst)
#' bst <- xgb.load.raw(raw)
4 changes: 2 additions & 2 deletions R-package/R/xgb.serialize.R
@@ -11,8 +11,8 @@
#' data(agaricus.test, package='xgboost')
#' train <- agaricus.train
#' test <- agaricus.test
#' bst <- xgboost(data = train$data, label = train$label, max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' bst <- xgb.train(data = xgb.DMatrix(train$data, label = train$label), max_depth = 2,
#' eta = 1, nthread = 2, nrounds = 2,objective = "binary:logistic")
#' raw <- xgb.serialize(bst)
#' bst <- xgb.unserialize(raw)
#'
16 changes: 8 additions & 8 deletions R-package/R/xgb.train.R
@@ -268,9 +268,9 @@
#' watchlist <- list(train = dtrain, eval = dtest)
#'
#' ## A simple xgb.train example:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
#' param <- list(max_depth = 2, eta = 1, nthread = nthread,
#' objective = "binary:logistic", eval_metric = "auc")
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
#'
#' ## An xgb.train example where custom objective and evaluation metric are
#' ## used:
@@ -289,13 +289,13 @@
#'
#' # These functions could be used by passing them either:
#' # as 'objective' and 'eval_metric' parameters in the params list:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
#' param <- list(max_depth = 2, eta = 1, nthread = nthread,
#' objective = logregobj, eval_metric = evalerror)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0)
#'
#' # or through the ... arguments:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
#' param <- list(max_depth = 2, eta = 1, nthread = nthread)
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
#' objective = logregobj, eval_metric = evalerror)
#'
#' # or as dedicated 'obj' and 'feval' parameters of xgb.train:
@@ -304,10 +304,10 @@
#'
#'
#' ## An xgb.train example of using variable learning rates at each iteration:
#' param <- list(max_depth = 2, eta = 1, verbose = 0, nthread = nthread,
#' param <- list(max_depth = 2, eta = 1, nthread = nthread,
#' objective = "binary:logistic", eval_metric = "auc")
#' my_etas <- list(eta = c(0.5, 0.1))
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist,
#' bst <- xgb.train(param, dtrain, nrounds = 2, watchlist, verbose = 0,
#' callbacks = list(cb.reset.parameters(my_etas)))
#'
#' ## Early stopping:
3 changes: 2 additions & 1 deletion R-package/R/xgboost.R
@@ -82,7 +82,7 @@ NULL
NULL

# Various imports
#' @importClassesFrom Matrix dgCMatrix dgeMatrix
#' @importClassesFrom Matrix dgCMatrix dgeMatrix dgRMatrix
#' @importFrom Matrix colSums
#' @importFrom Matrix sparse.model.matrix
#' @importFrom Matrix sparseVector
@@ -98,6 +98,7 @@ NULL
#' @importFrom data.table setnames
#' @importFrom jsonlite fromJSON
#' @importFrom jsonlite toJSON
#' @importFrom methods new
#' @importFrom utils object.size str tail
#' @importFrom stats coef
#' @importFrom stats predict
4 changes: 2 additions & 2 deletions R-package/demo/create_sparse_matrix.R
@@ -81,8 +81,8 @@ output_vector <- df[, Y := 0][Improved == "Marked", Y := 1][, Y]

# Following is the same process as other demo
cat("Learning...\n")
bst <- xgboost(data = sparse_matrix, label = output_vector, max_depth = 9,
eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")
bst <- xgb.train(data = xgb.DMatrix(sparse_matrix, label = output_vector), max_depth = 9,
eta = 1, nthread = 2, nrounds = 10, objective = "binary:logistic")

importance <- xgb.importance(feature_names = colnames(sparse_matrix), model = bst)
print(importance)
18 changes: 9 additions & 9 deletions R-package/demo/interaction_constraints.R
@@ -74,26 +74,26 @@ cols2ids <- function(object, col_names) {
interaction_list_fid <- cols2ids(interaction_list, colnames(train))

# Fit model with interaction constraints
bst <- xgboost(data = train, label = y, max_depth = 4,
eta = 0.1, nthread = 2, nrounds = 1000,
interaction_constraints = interaction_list_fid)
bst <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
eta = 0.1, nthread = 2, nrounds = 1000,
interaction_constraints = interaction_list_fid)

bst_tree <- xgb.model.dt.tree(colnames(train), bst)
bst_interactions <- treeInteractions(bst_tree, 4)
# interactions constrained to combinations of V1*V2 and V3*V4*V5

# Fit model without interaction constraints
bst2 <- xgboost(data = train, label = y, max_depth = 4,
eta = 0.1, nthread = 2, nrounds = 1000)
bst2 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
eta = 0.1, nthread = 2, nrounds = 1000)

bst2_tree <- xgb.model.dt.tree(colnames(train), bst2)
bst2_interactions <- treeInteractions(bst2_tree, 4) # much more interactions

# Fit model with both interaction and monotonicity constraints
bst3 <- xgboost(data = train, label = y, max_depth = 4,
eta = 0.1, nthread = 2, nrounds = 1000,
interaction_constraints = interaction_list_fid,
monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0))
bst3 <- xgb.train(data = xgb.DMatrix(train, label = y), max_depth = 4,
eta = 0.1, nthread = 2, nrounds = 1000,
interaction_constraints = interaction_list_fid,
monotone_constraints = c(-1, 0, 0, 0, 0, 0, 0, 0, 0, 0))

bst3_tree <- xgb.model.dt.tree(colnames(train), bst3)
bst3_interactions <- treeInteractions(bst3_tree, 4)
4 changes: 2 additions & 2 deletions R-package/demo/poisson_regression.R
@@ -1,6 +1,6 @@
data(mtcars)
head(mtcars)
bst <- xgboost(data = as.matrix(mtcars[, -11]), label = mtcars[, 11],
objective = 'count:poisson', nrounds = 5)
bst <- xgb.train(data = xgb.DMatrix(as.matrix(mtcars[, -11]), label = mtcars[, 11]),
objective = 'count:poisson', nrounds = 5)
pred <- predict(bst, as.matrix(mtcars[, -11]))
sqrt(mean((pred - mtcars[, 11]) ^ 2))
5 changes: 3 additions & 2 deletions R-package/man/a-compatibility-note-for-saveRDS-save.Rd

Some generated files are not rendered by default.

