Skip to content

Commit

Permalink
Merge pull request #426 from tlverse/rm-imputeMissings
Browse files Browse the repository at this point in the history
Rm impute missings
  • Loading branch information
blind-contours authored Oct 4, 2023
2 parents 64806d5 + b75abeb commit 507e0a1
Show file tree
Hide file tree
Showing 12 changed files with 81 additions and 30 deletions.
1 change: 0 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ Imports:
ggplot2,
digest,
Rdpack,
imputeMissings,
dplyr,
caret,
ROCR
Expand Down
1 change: 0 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,6 @@ importFrom(ggplot2,geom_point)
importFrom(ggplot2,ggplot)
importFrom(ggplot2,labs)
importFrom(ggplot2,scale_x_discrete)
importFrom(imputeMissings,impute)
importFrom(methods,is)
importFrom(origami,combiner_c)
importFrom(origami,cross_validate)
Expand Down
74 changes: 56 additions & 18 deletions R/process_data.R
Original file line number Diff line number Diff line change
Expand Up @@ -51,35 +51,34 @@
#' when creating the \code{\link{sl3_Task}} by setting
#' \code{drop_missing_outcome = TRUE}.
#'
#' @importFrom imputeMissings impute
#'
#' @return A list of processed data, nodes and column names
#'
#' @export
process_data <- function(data, nodes, column_names, flag = TRUE,
drop_missing_outcome = FALSE) {

# force a copy so we can mutate data in place w/o distrupting a user's data
if (inherits(data, "data.table")) {
data <- data.table::copy(data)
} else {
data <- as.data.table(data)
}

all_nodes <- unlist(nodes)


if (length(all_nodes) == 0) {
return(list(data = data, nodes = nodes, column_names = column_names))
}
node_columns <- unlist(column_names[all_nodes])
covariates_columns <- unlist(column_names[nodes$covariates])
outcome_columns <- unlist(column_names[nodes$outcome])

factorized <- FALSE
dropped <- FALSE
imputed <- FALSE

# process characters
is_character <- which(data[, sapply(.SD, is.character), .SDcols = node_columns])
char_cols <- node_columns[is_character]
Expand All @@ -89,23 +88,23 @@ process_data <- function(data, nodes, column_names, flag = TRUE,
"Character variables found: %s;\nConverting these to factors",
paste0(char_vars, collapse = ", ")
))

# convert data
for (char_col in char_cols) {
set(data, , char_col, as.factor(unlist(data[, char_col, with = FALSE])))
}
factorized <- TRUE
}

# process missing
has_missing <- data[, sapply(.SD, function(x) any(is.na(x))), .SDcols = node_columns]
miss_cols <- node_columns[has_missing]
miss_vars <- all_nodes[has_missing]

missing_Y <- any(nodes$outcome %in% miss_vars)
missing_covar_cols <- intersect(miss_cols, covariates_columns)
missing_covar_vars <- intersect(miss_vars, nodes$covariates)

if (length(miss_cols) > 0) {
if (missing_Y && drop_missing_outcome) {
if (flag) {
Expand All @@ -114,36 +113,75 @@ process_data <- function(data, nodes, column_names, flag = TRUE,
keep_rows <- stats::complete.cases(data[, outcome_columns, with = FALSE])
data <- data[keep_rows, ]
}

if (length(missing_covar_cols) > 0) {
warning(sprintf(
"Imputing missing values and adding missingness indicators for the following covariates with missing values: %s. See documentation of the process_data function for details.",
paste0(missing_covar_cols, collapse = ", ")
))
# make indicators and add to data
missing_indicators <- data[, lapply(.SD, function(x) as.numeric(!is.na(x))),
.SDcols = missing_covar_cols
.SDcols = missing_covar_cols
]

missing_indicator_cols <- sprintf("delta_%s", missing_covar_cols)
missing_indicator_vars <- sprintf("delta_%s", missing_covar_vars)
setnames(missing_indicators, missing_indicator_cols)
set(data, , missing_indicator_cols, missing_indicators)

# add inidicators to column map and covariate list
column_names[missing_indicator_vars] <- missing_indicator_cols
nodes$covariates <- c(nodes$covariates, missing_indicator_vars)
}
# impute covariates
imputed <- impute(data[, missing_covar_cols, with = FALSE], flag = FALSE)

data_missing_covars <- data[, missing_covar_cols, with = FALSE]
imputed <- impute(data.frame(data_missing_covars))

# update data
set(data, , missing_covar_cols, imputed)
}

na_Y <- (!is.null(nodes$outcome) && any(is.na(data[, outcome_columns, with = F])))
if (na_Y && flag) {
warning("Missing outcome data detected. This is okay for prediction, but will likely break training. \n You can drop observations with missing outcomes by setting drop_missing_outcome=TRUE in make_sl3_Task.")
}
list(data = data, nodes = nodes, column_names = column_names)
}

#' Impute missing values with the median/mode
#' based on imputeMissings R package (removed from CRAN)
#'
#' Character vectors and factors are imputed with the mode.
#' Numeric and integer vectors are imputed with median.
#'
#' @param data A data frame with dummies or numeric variables.
#'
#' @keywords internal
impute <- function(data){
compute <- function (data){
Mode <- function(x) {
xtab <- table(x)
xmode <- names(which(xtab == max(xtab)))
return(xmode[1])
}
values <- sapply(data, function(x) {
if (class(x) %in% c("character", "factor"))
Mode(x)
else if (class(x) %in% c("numeric", "integer"))
median(x, na.rm = TRUE)
}, simplify = FALSE)
values
}
object <- compute(data)
if (!identical(colnames(data), names(object))){
stop('Variable names and variable positions need to be identical in compute and impute')
}
data <- data.frame(sapply(1:ncol(data), function(i) {
fact <- is.factor(data[,i])
if (fact) data[,i] <- as.character(data[,i])
data[is.na(data[,i]),i] <- object[[i]]
if (fact) data[,i] <- as.factor(data[,i])
return(data[,i,drop=FALSE])
}, simplify = FALSE))
data
}
1 change: 1 addition & 0 deletions man/Lrnr_density_hse.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/Lrnr_grf.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions man/Lrnr_h2o_glm.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_multivariate.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_screener_augment.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_screener_coefs.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_screener_correlation.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 1 addition & 2 deletions man/Lrnr_subset_covariates.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/impute.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 507e0a1

Please sign in to comment.