From a7cc54f2b1ab920adf8fac3e6cd460cbef57ecc0 Mon Sep 17 00:00:00 2001 From: blind-contours Date: Thu, 28 Sep 2023 20:30:17 -0600 Subject: [PATCH 1/4] removing dependency for imputeMissings --- DESCRIPTION | 1 - R/process_data.R | 71 ++++++++++++++++++++++++++++++++++++------------ 2 files changed, 54 insertions(+), 18 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index d91c7234..af423e44 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,6 @@ Imports: ggplot2, digest, Rdpack, - imputeMissings, dplyr, caret, ROCR diff --git a/R/process_data.R b/R/process_data.R index a93c0ffb..108f854d 100644 --- a/R/process_data.R +++ b/R/process_data.R @@ -51,35 +51,34 @@ #' when creating the \code{\link{sl3_Task}} by setting #' \code{drop_missing_outcome = TRUE}. #' -#' @importFrom imputeMissings impute #' #' @return A list of processed data, nodes and column names #' #' @export process_data <- function(data, nodes, column_names, flag = TRUE, drop_missing_outcome = FALSE) { - + # force a copy so we can mutate data in place w/o distrupting a user's data if (inherits(data, "data.table")) { data <- data.table::copy(data) } else { data <- as.data.table(data) } - + all_nodes <- unlist(nodes) - - + + if (length(all_nodes) == 0) { return(list(data = data, nodes = nodes, column_names = column_names)) } node_columns <- unlist(column_names[all_nodes]) covariates_columns <- unlist(column_names[nodes$covariates]) outcome_columns <- unlist(column_names[nodes$outcome]) - + factorized <- FALSE dropped <- FALSE imputed <- FALSE - + # process characters is_character <- which(data[, sapply(.SD, is.character), .SDcols = node_columns]) char_cols <- node_columns[is_character] @@ -89,23 +88,23 @@ process_data <- function(data, nodes, column_names, flag = TRUE, "Character variables found: %s;\nConverting these to factors", paste0(char_vars, collapse = ", ") )) - + # convert data for (char_col in char_cols) { set(data, , char_col, as.factor(unlist(data[, char_col, with = FALSE]))) } factorized <- TRUE } - + # process missing has_missing <- data[, sapply(.SD, function(x) any(is.na(x))), .SDcols = node_columns] miss_cols <- node_columns[has_missing] miss_vars <- all_nodes[has_missing] - + missing_Y <- any(nodes$outcome %in% miss_vars) missing_covar_cols <- intersect(miss_cols, covariates_columns) missing_covar_vars <- intersect(miss_vars, nodes$covariates) - + if (length(miss_cols) > 0) { if (missing_Y && drop_missing_outcome) { if (flag) { @@ -114,7 +113,7 @@ process_data <- function(data, nodes, column_names, flag = TRUE, keep_rows <- stats::complete.cases(data[, outcome_columns, with = FALSE]) data <- data[keep_rows, ] } - + if (length(missing_covar_cols) > 0) { warning(sprintf( "Imputing missing values and adding missingness indicators for the following covariates with missing values: %s. See documentation of the process_data function for details.", @@ -122,28 +121,66 @@ process_data <- function(data, nodes, column_names, flag = TRUE, )) # make indicators and add to data missing_indicators <- data[, lapply(.SD, function(x) as.numeric(!is.na(x))), - .SDcols = missing_covar_cols + .SDcols = missing_covar_cols ] - + missing_indicator_cols <- sprintf("delta_%s", missing_covar_cols) missing_indicator_vars <- sprintf("delta_%s", missing_covar_vars) setnames(missing_indicators, missing_indicator_cols) set(data, , missing_indicator_cols, missing_indicators) - + # add inidicators to column map and covariate list column_names[missing_indicator_vars] <- missing_indicator_cols nodes$covariates <- c(nodes$covariates, missing_indicator_vars) } # impute covariates imputed <- impute(data[, missing_covar_cols, with = FALSE], flag = FALSE) - + # update data set(data, , missing_covar_cols, imputed) } - + na_Y <- (!is.null(nodes$outcome) && any(is.na(data[, outcome_columns, with = F]))) if (na_Y && flag) { warning("Missing outcome data detected. This is okay for prediction, but will likely break training. \n You can drop observations with missing outcomes by setting drop_missing_outcome=TRUE in make_sl3_Task.") } list(data = data, nodes = nodes, column_names = column_names) } + +#' Impute missing values with the median/mode +#' based on imputeMissings R package (removed from CRAN) +#' +#' Character vectors and factors are imputed with the mode. +#' Numeric and integer vectors are imputed with median. +#' +#' @param data A data frame with dummies or numeric variables. +#' +#' @keywords internal +impute <- function(data){ + compute <- function (data){ + Mode <- function(x) { + xtab <- table(x) + xmode <- names(which(xtab == max(xtab))) + return(xmode[1]) + } + values <- sapply(data, function(x) { + if (class(x) %in% c("character", "factor")) + Mode(x) + else if (class(x) %in% c("numeric", "integer")) + median(x, na.rm = TRUE) + }, simplify = FALSE) + values + } + object <- compute(data) + if (!identical(colnames(data), names(object))){ + stop('Variable names and variable positions need to be identical in compute and impute') + } + data <- data.frame(sapply(1:ncol(data), function(i) { + fact <- is.factor(data[,i]) + if (fact) data[,i] <- as.character(data[,i]) + data[is.na(data[,i]),i] <- object[[i]] + if (fact) data[,i] <- as.factor(data[,i]) + return(data[,i,drop=FALSE]) + }, simplify = FALSE)) + data +} From a1eb91cbf1bb2f8076259a5702dfc68a56bcce36 Mon Sep 17 00:00:00 2001 From: blind-contours Date: Thu, 28 Sep 2023 20:31:12 -0600 Subject: [PATCH 2/4] make doc changes --- NAMESPACE | 1 - man/Lrnr_density_hse.Rd | 1 + man/Lrnr_grf.Rd | 1 + man/Lrnr_h2o_glm.Rd | 1 + man/Lrnr_multivariate.Rd | 3 +-- man/Lrnr_screener_augment.Rd | 3 +-- man/Lrnr_screener_coefs.Rd | 3 +-- man/Lrnr_screener_correlation.Rd | 3 +-- man/Lrnr_subset_covariates.Rd | 3 +-- man/impute.Rd | 17 +++++++++++++++++ 10 files changed, 25 insertions(+), 11 deletions(-) create mode 100644 man/impute.Rd diff --git a/NAMESPACE b/NAMESPACE index 3695bee1..189530bd 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -164,7 +164,6 @@ importFrom(ggplot2,geom_point) importFrom(ggplot2,ggplot) importFrom(ggplot2,labs) importFrom(ggplot2,scale_x_discrete) -importFrom(imputeMissings,impute) importFrom(methods,is) importFrom(origami,combiner_c) importFrom(origami,cross_validate) diff --git a/man/Lrnr_density_hse.Rd b/man/Lrnr_density_hse.Rd index b167820f..c8c63e0b 100644 --- a/man/Lrnr_density_hse.Rd +++ b/man/Lrnr_density_hse.Rd @@ -45,6 +45,7 @@ task <- sl3_Task$new( covariates = c("apgar1", "apgar5", "parity", "gagebrth", "mage", "meducyrs"), outcome = "haz" ) + # train density hse learner and make predictions lrnr_density_hse <- Lrnr_density_hse$new(mean_learner = Lrnr_glm$new()) fit_density_hse <- lrnr_density_hse$train(task) diff --git a/man/Lrnr_grf.Rd b/man/Lrnr_grf.Rd index 550aefea..8ab9e911 100644 --- a/man/Lrnr_grf.Rd +++ b/man/Lrnr_grf.Rd @@ -87,6 +87,7 @@ task <- sl3_Task$new( covariates = c("apgar1", "apgar5", "parity", "gagebrth", "mage", "meducyrs"), outcome = "haz" ) + # train grf learner and make predictions lrnr_grf <- Lrnr_grf$new(seed = 123) lrnr_grf_fit <- lrnr_grf$train(task) diff --git a/man/Lrnr_h2o_glm.Rd b/man/Lrnr_h2o_glm.Rd index 52bf5678..254876f0 100644 --- a/man/Lrnr_h2o_glm.Rd +++ b/man/Lrnr_h2o_glm.Rd @@ -74,6 +74,7 @@ task <- sl3_Task$new( covariates = c("apgar1", "apgar5", "parity", "gagebrth", "mage", "meducyrs"), outcome = "haz" ) + # train h2o glm learner and make predictions lrnr_h2o <- Lrnr_h2o_glm$new() lrnr_h2o_fit <- lrnr_h2o$train(task) diff --git a/man/Lrnr_multivariate.Rd b/man/Lrnr_multivariate.Rd index 28586b8b..deff4f5b 100644 --- a/man/Lrnr_multivariate.Rd +++ b/man/Lrnr_multivariate.Rd @@ -52,8 +52,7 @@ covariates <- grep("W", names(data), value = TRUE) outcomes <- grep("Y", names(data), value = TRUE) # make sl3 task -task <- sl3_Task$new( - data = data.table::copy(data), +task <- sl3_Task$new(data.table::copy(data), covariates = covariates, outcome = outcomes ) diff --git a/man/Lrnr_screener_augment.Rd b/man/Lrnr_screener_augment.Rd index 3e5be646..d3bd716f 100644 --- a/man/Lrnr_screener_augment.Rd +++ b/man/Lrnr_screener_augment.Rd @@ -40,8 +40,7 @@ covars <- c( outcome <- "haz" # create sl3 task -task <- sl3_Task$new( - data = data.table::copy(cpp_imputed), +task <- sl3_Task$new(data.table::copy(cpp_imputed), covariates = covars, outcome = outcome ) diff --git a/man/Lrnr_screener_coefs.Rd b/man/Lrnr_screener_coefs.Rd index b3f4b143..31deea66 100644 --- a/man/Lrnr_screener_coefs.Rd +++ b/man/Lrnr_screener_coefs.Rd @@ -42,8 +42,7 @@ covars <- c( outcome <- "haz" # create sl3 task -task <- sl3_Task$new( - data = data.table::copy(cpp_imputed), +task <- sl3_Task$new(data.table::copy(cpp_imputed), covariates = covars, outcome = outcome ) diff --git a/man/Lrnr_screener_correlation.Rd b/man/Lrnr_screener_correlation.Rd index 10acfcc5..a715c9f8 100644 --- a/man/Lrnr_screener_correlation.Rd +++ b/man/Lrnr_screener_correlation.Rd @@ -48,8 +48,7 @@ covars <- c( outcome <- "haz" # create sl3 task -task <- sl3_Task$new( - data = data.table::copy(cpp_imputed), +task <- sl3_Task$new(data.table::copy(cpp_imputed), covariates = covars, outcome = outcome ) diff --git a/man/Lrnr_subset_covariates.Rd b/man/Lrnr_subset_covariates.Rd index d887ee6f..33763b54 100644 --- a/man/Lrnr_subset_covariates.Rd +++ b/man/Lrnr_subset_covariates.Rd @@ -42,8 +42,7 @@ covars <- c("apgar1", "apgar5", "parity", "gagebrth", "mage", "meducyrs", "sexn" outcome <- "haz" # create sl3 task -task <- sl3_Task$new( - data = data.table::copy(cpp_imputed), +task <- sl3_Task$new(data.table::copy(cpp_imputed), covariates = covars, outcome = outcome, folds = origami::make_folds(cpp_imputed, V = 3) diff --git a/man/impute.Rd b/man/impute.Rd new file mode 100644 index 00000000..557fd969 --- /dev/null +++ b/man/impute.Rd @@ -0,0 +1,17 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/process_data.R +\name{impute} +\alias{impute} +\title{Impute missing values with the median/mode +based on imputeMissings R package (removed from CRAN)} +\usage{ +impute(data) +} +\arguments{ +\item{data}{A data frame with dummies or numeric variables.} +} +\description{ +Character vectors and factors are imputed with the mode. +Numeric and integer vectors are imputed with median. +} +\keyword{internal} From 7433178c33fbfd43fb1daa2ea49180e645f38e3a Mon Sep 17 00:00:00 2001 From: "Rachael V. Phillips" <15657150+rachaelvp@users.noreply.github.com> Date: Fri, 29 Sep 2023 00:35:31 -0700 Subject: [PATCH 3/4] rm omitted argument in impute fxn --- R/process_data.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/process_data.R b/R/process_data.R index 108f854d..d1a31c6c 100644 --- a/R/process_data.R +++ b/R/process_data.R @@ -134,7 +134,7 @@ process_data <- function(data, nodes, column_names, flag = TRUE, nodes$covariates <- c(nodes$covariates, missing_indicator_vars) } # impute covariates - imputed <- impute(data[, missing_covar_cols, with = FALSE], flag = FALSE) + imputed <- impute(data[, missing_covar_cols, with = FALSE]) # update data set(data, , missing_covar_cols, imputed) From b75abeba5f6a40acd072e3057054da57060b9d0f Mon Sep 17 00:00:00 2001 From: blind-contours Date: Tue, 3 Oct 2023 19:56:23 -0600 Subject: [PATCH 4/4] fixed the data.table indexing passed to impute --- R/process_data.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/R/process_data.R b/R/process_data.R index d1a31c6c..982d9dae 100644 --- a/R/process_data.R +++ b/R/process_data.R @@ -134,7 +134,8 @@ process_data <- function(data, nodes, column_names, flag = TRUE, nodes$covariates <- c(nodes$covariates, missing_indicator_vars) } # impute covariates - imputed <- impute(data[, missing_covar_cols, with = FALSE]) + data_missing_covars <- data[, missing_covar_cols, with = FALSE] + imputed <- impute(data.frame(data_missing_covars)) # update data set(data, , missing_covar_cols, imputed)