stan-dev · jsocolar · May 21, 2021 · May 21, 2021 · May 21, 2021 · May 21, 2021
diff --git a/NAMESPACE b/NAMESPACE
@@ -206,6 +206,8 @@ S3method(reserved_variables,draws_df)
 S3method(reserved_variables,draws_list)
 S3method(reserved_variables,draws_matrix)
 S3method(reserved_variables,draws_rvars)
+S3method(rollup_summary,default)
+S3method(rollup_summary,draws_summary)
 S3method(sd,default)
 S3method(sd,rvar)
 S3method(split_chains,draws)
@@ -325,6 +327,7 @@ export(resample_draws)
 export(rfun)
 export(rhat)
 export(rhat_basic)
+export(rollup_summary)
 export(rstar)
 export(rvar)
 export(rvar_all)

diff --git a/R/parse_variable_indices.R b/R/parse_variable_indices.R
@@ -0,0 +1,135 @@
+#' Parse the indices from a vector of variables (extracted e.g. from a 
+#' `draws_summary` object or a `draws` object)
+#' 
+#' @name variable_indices
+#' @param x a character vector of variables
+#'
+#' @return
+#' A list with index information for each unique variable name V in `x`. Top-level list names are 
+#' the variable names. Each element contains: 
+#'    $ndim  the number of dimensions of V. Returns 0 for scalars with no brackets
+#'    but 1 for `y[1]` even if `y` has no other entries in `x`.
+#'    
+#'    $dim  a vector of the actual dimensions of V, as determined by the number of unique
+#'    elements at each index position. Set to `NA` if the ndim is zero.
+#'    
+#'    $implied_dim  a vector of the implied dimensions of V, where any position in V that 
+#'    contains exclusively integers is filled in to include all integers from the lesser of one 
+#'    and its minimum up to its maximum. Set to `NA` if ndim is zero.
+#'    
+#'    $index_names  a list of length corresponding to ndim, where each element is the
+#'    unique levels of the corresponding index if the index is parsed as factor, and NULL otherwise.
+#'    Set to `NULL` if ndim is zero.
+#'    
+#'    $indices  if ndim is zero, returns 1.
+#'    if ndim is 1 or greater, returns a dataframe of every implied combination of indices
+#'    
+#'    $position  the position of each combination of indices from $indices in the the argument `x`
+#' 
+#' @details
+#' Assumes that variable indexing uses square brackets in the variable names
+#' 
+NULL
+
+parse_variable_indices <- function(x){
+  vars_indices <- strsplit(x, "(\\[|\\])")
+  vars <- sapply(vars_indices, `[[`, 1)
+  var_names <- unique(vars)
+  # Check that no variables contain unpaired or non-terminal square brackets
+  if_indexed <- lengths(vars_indices) > 1
+  if_indexed2 <- grepl("\\[.*\\]$", x)
+  bracket_problems <- grepl("\\].|\\[.*\\[|\\[,|,\\]|,,", x)
+  if (any(bracket_problems) | (!identical(if_indexed, if_indexed2))) {
+    stop_no_call(paste("Some variable names contain unpaired square brackets,",
+                  "missing indices, or are multi-indexed."))
+  }
+  # Get ndim. Variables with no brackets are given as ndim zero.
+  # Variables with brackets are given ndim 1, even if they contain just one element.
+  ndim_elementwise <- sapply(vars_indices, function(x){
+    if (length(x) == 2) {
+      out <- length(strsplit(x[2], ",")[[1]]) # number of commas plus one
+    } else {
+      out <- 0
+    }
+    out
+  })
+  ndim <- sapply(var_names, function(x){
+    out <- unique(ndim_elementwise[vars == x])
+    if (length(out) != 1) {
+      stop_no_call(paste0("Inconsistent indexing found for variable ", x, " ."))
+    }
+    out
+  })
+  variable_indices_info <- lapply(var_names, function (x) {
+    indices_info <- named_list(c("ndim", "dimensions", "implied_dimensions", "index_names", "indices", "position"))
+    indices_info$ndim <- ndim[[x]]
+    var_i <- vars == x
+    var_length <- sum(var_i)
+    if (ndim[x] == 0) {
+      # single variable, no indices
+      indices_info$dimensions <- indices_info$implied_dimensions <- NA
+      indices_info$indices <- 1L
+      indices_info$implied_dimensions <- NA
+      indices_info$internal_position <- 1
+    } else {
+      indices <- sapply(vars_indices[var_i], `[[`, 2)
+      indices <- as.data.frame(do.call(rbind, strsplit(indices, ",")),
+                               stringsAsFactors = FALSE)
+      indices_info$dimensions <- unname(apply(indices, 2, function(x){length(unique(x))}))
+      unique_indices <- vector("list", length(indices))
+      .dimnames <- vector("list", length(indices))
+      names(unique_indices) <- names(indices)
+      for (i in seq_along(indices)) {
+        numeric_index <- suppressWarnings(as.numeric(indices[[i]]))
+        if (!anyNA(numeric_index) && rlang::is_integerish(numeric_index)) {
+          # for integer indices, we need to convert them to integers
+          # so that we can sort them in numerical order (not string order)
+          indices[[i]] <- as.integer(numeric_index)
+          if (min(numeric_index) >= 1) {
+            # integer indices >= 1 are forced to lower bound of 1 + no dimnames
+            unique_indices[[i]] <- seq.int(1, max(numeric_index))
+          } else {
+            # indices with values < 1 are filled in between the min and max
+            unique_indices[[i]] <- seq.int(min(numeric_index), max(numeric_index))
+          }
+        } else {
+          # we convert non-numeric indices to factors so that we can force them
+          # to be ordered as they appear in the data (rather than in alphabetical order)
+          factor_levels <- unique(indices[[i]])
+          indices[[i]] <- factor(indices[[i]], levels = factor_levels)
+          # these aren't sorted so they appear in original order
+          unique_indices[[i]] <- factor(factor_levels, levels = factor_levels)
+          .dimnames[[i]] <- unique_indices[[i]]
+        }
+      }
+
+      indices_info$index_names <- .dimnames
+      indices_info$implied_dimensions <- unname(lengths(unique_indices))
+
+      # sort indices and fill in missing indices as NA to ensure
+      # (1) even if the order of the variables is something weird (like 
+      # x[2,2] comes before x[1,1]) the result
+      # places those columns in the correct cells of the array
+      # (2) if some combination of indices is missing (say x[2,1] isn't
+      # in the input) that cell in the array gets an NA
+
+      # Use expand.grid to get all cells in output array. We reverse indices
+      # here because it helps us do the sort after the merge, where
+      # we need to sort in reverse order of the indices (because
+      # the value of the last index should move slowest)
+      all_indices <- expand.grid(rev(unique_indices))
+      # merge with all.x = TRUE (left join) to fill in missing cells with NA
+      indices <- merge(all_indices, cbind(indices, index = seq_len(nrow(indices))),
+                       all.x = TRUE, sort = FALSE)
+      # need to do the sort manually after merge because when sort = TRUE, merge
+      # sorts factors as if they were strings, and we need factors to be sorted as factors
+      indices <- indices[do.call(order, as.list(indices[, -ncol(indices), drop = FALSE])),]
+
+      indices_info$indices <- unname(rev(indices[, names(indices) != "index", drop = FALSE]))
+      indices_info$position <- which(var_i)[indices$index]
+    }
+    indices_info
+  })
+  names(variable_indices_info) <- var_names
+  variable_indices_info
+}
diff --git a/R/rollup_summary.R b/R/rollup_summary.R
@@ -0,0 +1,113 @@
+#' "Roll up" `draws_summary` objects by collapsing over nonscalar parameters.
+#' 
+#' By default, all variables with names matched by `\\[.*\\]$` are rolled up, 
+#' but there is an option to pass a list of parameter names, which will roll up
+#' any variables matched by `^parameter_name\\[.*\\]$`
+#' 
+#' @name draws_summary_rollup
+#' @param x a `draws_summary` object or a `draws` object to be summarised
+#' @param rollup_vars a list of variable names (excluding brackets and indices) to roll up
+#' @param min_only a character vector of varable names for which only minimum values are 
+#'    desired in the rollup
+#' @param max_only a character vector of varable names for which only maximum values are 
+#'    desired in the rollup
+
+#' @return
+#' The `rollup_summary()` methods return a list of [tibble][tibble::tibble] data frames.
+#' The first element is a standard `draws_summary` for the variables that are not rolled up
+#' The second element is a rollup of the variables to be rolled up and contains max and min
+#' values of the summary functions attained by any element of the variable
+#' 
+#' @details
+#' By default, only the maximum value of `rhat` and the minimum values of [ess_bulk()] and 
+#' [ess_tail()] are returned.  # INSERT HOW WE HANDLE NA SUMMARIES
+#' 
+#' @examples
+#' ds <- summarise_draws(example_draws())
+#' ds2 <- summarise_draws(2 * example_draws())
+#' ds2$variable <- c("pi", "upsilon", 
+#'                   "omega[1,1]", "omega[2,1]", "omega[3,1]", "omega[4,1]",
+#'                   "omega[1,2]", "omega[2,2]", "omega[3,2]", "omega[4,2]")
+#' draws_summary <- rbind(ds, ds2)
+#' rollup_summary(draws_summary)
+#' rollup_summary(draws_summary, rollup_vars = "theta")
+#' rollup_summary(example_draws())
+NULL
+
+#' @rdname draws_summary_rollup
+#' @export
+rollup_summary <- function(x, rollup_vars = NULL,
+                           min_only = c("ess_bulk", "ess_tail"),
+                           max_only = "rhat") {
+  UseMethod("rollup_summary")
+}
+
+#' @rdname draws_summary_rollup
+#' @export
+rollup_summary.default <- function(x, rollup_vars = NULL,
+                                   min_only = c("ess_bulk", "ess_tail"),
+                                   max_only = "rhat") {
+  rollup_summary(summarise_draws(x), rollup_vars = rollup_vars,
+                 min_only = min_only,
+                 max_only = max_only)
+}
+
+#' @rdname draws_summary_rollup
+#' @export
+rollup_summary.draws_summary <- function (x, rollup_vars = NULL,
+                            min_only = c("ess_bulk", "ess_tail"),
+                            max_only = "rhat") {
+  # get variable names
+  vars <- draws_summary$variable
+  # Determine which variable names need to be rolled up
+  if (is.null(rollup_vars)) {
+    vars_nonscalar <- grepl("\\[", vars)
+  } else {
+    vars_nonscalar <- as.logical(colSums(do.call(rbind, 
+                                                 lapply(paste0("^", rollup_vars, "\\["),
+                                                        function(x){grepl(x, vars)}))))
+  }
+  # Separate out draws_summary into the scalar variables to leave alone and the nonscalar
+  # variables for rollup
+  ds_scalar <- draws_summary[!vars_nonscalar, ]
+  ds_nonscalar <- draws_summary[vars_nonscalar, ]
+  # Roll up the nonscalar variables
+  varnames_nonscalar <- gsub("\\[(.*)", "", ds_nonscalar$variable)
+  summary_names <- names(draws_summary)[-1]
+  names_minmax <- summary_names[!(summary_names %in% c(min_only, max_only))]
+  split_nonscalar <- split(ds_nonscalar, varnames_nonscalar)[unique(varnames_nonscalar)] 
+                          # [unique(varnames_nonscalar)] preserves the order of the names
+  min_max <- do.call(rbind, lapply(split_nonscalar, rollup_helper_minmax, 
+                                   names = names_minmax))
+  min_only <- do.call(rbind, lapply(split_nonscalar, rollup_helper_min, names = min_only))
+  max_only <- do.call(rbind, lapply(split_nonscalar, rollup_helper_max, names = max_only))
+  variable_column <- data.frame("variable" = unique(varnames_nonscalar))
+  variable_indices <- parse_variable_indices(ds_nonscalar$variable)
+  dimension_column <- data.frame("dimension" = paste0("(", 
+                                                      sapply(variable_indices, function(x){paste(x$dimensions, collapse = ",")}),
+                                                      ")"))
+  nonscalar_out <- tibble::as_tibble(cbind(variable_column, dimension_column, min_max, max_only, min_only))
+  out <- list(unrolled_vars = ds_scalar, rolled_vars = nonscalar_out)
+  out
+}
+
+rollup_helper_minmax <- function(x, names){
+  x <- x[, names]
+  mm <- c(apply(x, 2, function(x) {c(min(x), max(x))}))
+  names(mm) <- paste0(rep(names(x), each = 2), c("_min", "_max"))
+  mm
+}
+
+rollup_helper_min <- function(x, names){
+  x <- x[, names]
+  min_only <- apply(x, 2, min)
+  names(min_only) <- paste0(names(x), "_min")
+  min_only
+}
+
+rollup_helper_max <- function(x, names){
+  x <- x[, names]
+  max_only <- apply(x, 2, max)
+  names(max_only) <- paste0(names(x), "_max")
+  max_only
+}
diff --git a/man/draws_summary_rollup.Rd b/man/draws_summary_rollup.Rd
diff --git a/man/variable_indices.Rd b/man/variable_indices.Rd