kkbrum
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎DESCRIPTION‎
Lines changed: 11 additions & 6 deletions b/‎DESCRIPTION‎
Lines changed: 11 additions & 6 deletions
diff --git a/‎NAMESPACE‎
Lines changed: 4 additions & 0 deletions b/‎NAMESPACE‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎NEWS.md‎
Lines changed: 29 additions & 0 deletions b/‎NEWS.md‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎R/balance_LP.R‎
Lines changed: 60 additions & 19 deletions b/‎R/balance_LP.R‎
Lines changed: 60 additions & 19 deletions
diff --git a/‎R/check_balance.R‎
Lines changed: 53 additions & 34 deletions b/‎R/check_balance.R‎
Lines changed: 53 additions & 34 deletions
@@ -7,3 +7,5 @@ Meta
 .httr-oauth
 .DS_Store
 docs
+/doc/
+/Meta/
@@ -1,14 +1,17 @@
 Package: natstrat
 Type: Package
 Title: Obtain Unweighted Natural Strata that Balance Many Covariates
-Version: 1.0.0
+Version: 2.0.0
 Authors@R: c(
     person("Katherine", "Brumberg", email = "kbrum@wharton.upenn.edu", 
         role=c("aut", "cre")))
-Description: Natural strata fix a constant ratio of controls to treated units within 
-    each stratum. This ratio need not be an integer. The control units are 
-    chosen using randomized rounding of a linear program that balances many 
-    covariates.
+Description: Natural strata can be used in observational studies to balance
+    the distributions of many covariates across any number of treatment
+    groups and any number of comparisons. These strata have proportional 
+    numbers of units within each stratum across the treatments, allowing 
+    for simple interpretation and aggregation across strata. Within each 
+    stratum, the units are chosen using randomized rounding of a linear 
+    program that balances many covariates.
     To solve the linear program, the 'Gurobi' commercial optimization software 
     is recommended, but not required. The 'gurobi' R package can be installed following the instructions 
     at <https://www.gurobi.com/documentation/9.1/refman/ins_the_r_package.html>.
@@ -25,7 +28,9 @@ Imports:
     pps,
     sampling,
     ggplot2,
-    rlang
+    rlang,
+    ramify,
+    slam
 Depends:
     R (>= 2.10),
     caret
 
@@ -7,6 +7,8 @@ export(generate_qs)
 export(optimize_controls)
 export(stand)
 import(ggplot2)
+import(ramify)
+import(slam)
 importFrom(caret,dummyVars)
 importFrom(rlang,.data)
 importFrom(stats,as.formula)
@@ -15,6 +17,8 @@ importFrom(stats,median)
 importFrom(stats,model.frame.default)
 importFrom(stats,na.pass)
 importFrom(stats,predict)
+importFrom(stats,rbinom)
+importFrom(stats,rmultinom)
 importFrom(stats,sd)
 importFrom(stats,setNames)
 importFrom(stats,terms)
 
@@ -1,3 +1,32 @@
+# natstrat 2.0.0 (2021-10-12)
+
+This version adds several new functionalities:
+  * Multiple treatment or control groups
+  * Multiple separate comparisons, using various subsets of the treatment and control groups,
+  for which units are chosen in order to balance covariate distributions 
+  for all comparisons simultaneously
+
+Several changes to the interface have been made:
+
+* `z` should generally be a factor instead of a vector as before
+* `treated` and `control` specifications, if needed, should each be a level of `z`
+* many arguments can be specified for each of the treatment levels
+    * `q_s`, `max_entry_s` can have a row per treatment level
+    * `ratio`, `max_ratio` can have an entry per treatment level
+* inputs for the supplemental comparison have been added across the functions: `q_star_s`,
+`ratio_star`, `treated_star`, `weight_star`
+    
+There are several changes to the outputs:
+
+* `optimize_controls` now has only one version of `eps`, `objective`, and `objective_wo_importances`
+instead of both a raw and a regular version. The version now reported is the raw version, not corrected
+for missingness. If you would like corrected versions, refer to the standardized differences
+output by `check_balance` instead.
+* `generate_constraints` now returns only standardized outputs, not centered ones. The centering
+now takes place within `optimize_controls` instead.
+
+
+
 # natstrat 1.0.0 (2021-05-17)
 
 The first released version.
@@ -4,7 +4,10 @@
 #' to select.
 #'
 #' @inheritParams optimize_controls
-#' @param q_s a named vector indicating how many control units are to be selected from each stratum.
+#' @param q_s a named vector or matrix indicating how many control units are to be selected from each stratum.
+#'   If there is one control group and all treated units are desired, this can be a vector; otherwise,
+#'   this should have one row per treatment group, where the order of the rows matches the order of
+#'   the levels of \code{z}, including the treated level.
 #' @param st_vals the unique stratum levels contained in \code{st}.
 #' @param S the number of unique stratum levels contained in \code{st}.
 #' @param N the total number of available controls in the data.
@@ -19,41 +22,77 @@
 #' }
 #'
 #' @keywords internal
+#' @import ramify
+#' @import slam
 
 balance_LP <- function(z, X, importances, st, st_vals, S, q_s, N,
-                       solver, integer, time_limit) {
+                       solver, integer, time_limit, threads = 1,
+                       weight_comp = 1) {
+
   if (solver == "gurobi" && !requireNamespace("gurobi", quietly = TRUE)) {
     stop("Package \'gurobi\' needed if \"solver\" parameter set to \"gurobi\". Please
          install it or switch the \"solver\" parameter to \"Rglpk\".",
          call. = FALSE)
   }
+  groups <- levels(z)
+  k <- length(groups)
+  kc2 <- choose(k, 2)
+  n_comp <- length(q_s)
 
   # Set up and solve the linear program
   model <- list()
-  params <- list(TimeLimit = time_limit, OutputFlag = 0)
+  params <- list(TimeLimit = time_limit, OutputFlag = 0, Threads = threads)
 
   nvars <- dim(X)[2]  # number of variables
-  X[is.na(X)] <- 0
-  X0 <- X[z == 0, ]
-  model$obj <- c(rep(0, N), rep(importances, 2))
-  blk1 <- t(X0)
-  ident <- diag(1, nvars, nvars)  # identity matrix
-  model$A <- cbind(blk1 / sum(q_s), ident, -ident)  # constraints, individual vars
+  model$obj <- rep(0, n_comp * N)
+  for (comp in 1:n_comp) {
+    model$obj <- c(model$obj, rep(rep(importances * weight_comp[comp], 2), kc2))
+  }
+  model$A <- create_balance_matrices(X = X, z = z, N = N, nvars = nvars,
+                          kc2 = kc2, q_s = q_s, return = "A")$A
+
+    # Now, append stratum size constraints for each comparison
+  st_mats <- simple_triplet_zero_matrix(nrow = k * S, ncol = N)
+  for (group_num in 1:k) {
+    group <- groups[group_num]
+    st_mats[((group_num - 1) * S + 1):(group_num * S), which(z == group)] <- 1 * outer(st_vals, st[z == group], "==")
+  }
+  for (comp in 1:n_comp) {
+    model$A <- rbind(model$A,
+                     cbind(simple_triplet_zero_matrix(nrow = k * S, ncol = (N * (comp - 1))),
+                           st_mats, simple_triplet_zero_matrix(nrow = k * S, ncol = N * (n_comp - comp) + 2 * n_comp * kc2 * nvars)))
+  }
+
+  # Now, if multiple comparisons, add constraint that all a's for a unit add to <= 1
+  # (so that one unit is not chosen for multiple comparisons)
+  if (n_comp > 1) {
+    mat <- do.call(cbind, replicate(n_comp, simple_triplet_diag_matrix(rep(1, N)), simplify=FALSE))
+    model$A <- rbind(model$A, cbind(mat, simple_triplet_zero_matrix(nrow = N, ncol = 2 * n_comp * kc2 * nvars)))
+  }
 
-  # Now, append stratum size constraints
-  model$A <- rbind(model$A, cbind(1 * outer(st_vals, st[z == 0], "=="), matrix(0, S,
-                                                                               2 * nvars)))
   # Constraints for eps are equalities, number of controls per strata are equalities
-  model$sense <- c(rep("==", nvars), rep("==", S))
-  model$rhs <- c(rep(0, nvars), q_s)  # Right hand side of constraints
+  # Constraints for units only counting in one comparison are <=
+  model$sense <- c(rep("==", n_comp * kc2 * nvars), rep("==", n_comp * k * S))
+  if (n_comp > 1) {
+    model$sense <- c(model$sense, rep("<=", N))
+  }
+
+  # right hand side of constraints
+  model$rhs <- rep(0, n_comp * kc2 * nvars)
+  for (comp in 1:n_comp) {
+    model$rhs <- c(model$rhs, ramify::flatten(q_s[[comp]]))
+  }
+  if (n_comp > 1) {
+    model$rhs <- c(model$rhs, rep(1, N))
+  }
 
-  ndecv <- as.integer(N + (2 * nvars))  # number of decision variables
+  ndecv <- as.integer(n_comp * N + (2 * n_comp * kc2 * nvars))  # number of decision variables
+  model$ub <- c(rep(1, n_comp * N), rep(Inf, 2 * n_comp * kc2 * nvars))
   model$lb <- rep(0, ndecv)
-  model$ub <- c(rep(1, N), rep(Inf, 2 * nvars))
   bounds <- list(lower = list(ind = 1:ndecv, val = model$lb),
                  upper = list(ind = 1:ndecv, val = model$ub))
   if (integer) {
-    model$vtype <- c(rep("B", N), rep("C", 2 * nvars))
+    model$vtype <- c(rep("B", n_comp * N), rep("C", 2 * n_comp * kc2 * nvars))
   } else {
     model$vtype <- rep("C", ndecv)
   }
@@ -64,7 +103,8 @@ balance_LP <- function(z, X, importances, st, st_vals, S, q_s, N,
     } else {
       params$TimeLimit <- 0
     }
-    o <- Rglpk::Rglpk_solve_LP(model$obj, model$A, model$sense, model$rhs, bounds = bounds,
+    o <- Rglpk::Rglpk_solve_LP(obj = model$obj, mat = model$A, dir = model$sense,
+                               rhs = model$rhs, bounds = bounds,
                                types = model$vtype, control = list(
                                  canonicalize_status = FALSE, tm_limit = params$TimeLimit))
     if (o$status != 5) {
@@ -75,7 +115,8 @@ balance_LP <- function(z, X, importances, st, st_vals, S, q_s, N,
   }
   if (solver == "gurobi") {
     # Note that for gurobi, all inequalities are interpreted to be "or equal to"
-    model$sense <- c(rep("=", nvars), rep("=", S))
+    model$sense <- c(rep("=", n_comp * kc2 * nvars), rep("=", n_comp * k * S),
+                     rep("<", N))
     o <- gurobi::gurobi(model, params)
     if (o$status != "OPTIMAL") {
       warning("No solution found for the linear program.")
 
@@ -6,8 +6,13 @@
 #' This function can also generate love plots of the same quantities.
 #'
 #' @inheritParams stand
+#' @inheritParams optimize_controls
 #' @param X a data frame containing the covariates in the columns over which balance is desired. The number
 #' of rows should equal the length of \code{z}.
+#' @param treated which treatment value should be considered the treated units. This
+#' must be one of the values of \code{z}.
+#' @param control which treatment value should be considered the control units. This
+#' must be one of the values of \code{z}.
 #' @param selected a boolean vector including whether each unit was selected as part of the treated and control
 #' groups for analysis. Should be the same length as \code{z} and typically comes from the results of
 #' \code{\link{optimize_controls}()}.
@@ -65,40 +70,45 @@
 #'                              selected = results$selected,
 #'                              plot = TRUE)
 
-
-check_balance <- function(z, X, st, selected, denom_variance = "treated", plot = FALSE, message = TRUE) {
+check_balance <- function(z, X, st, selected, treated = 1, control = 0,
+                          denom_variance = "treated", plot = FALSE, message = TRUE) {
 
   if (plot && !requireNamespace("ggplot2", quietly = TRUE) && !requireNamespace("rlang", quietly = TRUE)) {
-      stop("Packages \"ggplot2\" and \"rlang\" needed if \"plot\" argument set to \"TRUE\". Please
+    stop("Packages \"ggplot2\" and \"rlang\" needed if \"plot\" argument set to \"TRUE\". Please
          install these or switch the \"plot\" argument to \"FALSE\".",
-           call. = FALSE)
+         call. = FALSE)
   }
 
   st <- as.factor(st)
   X[, sapply(X, is.logical)] <- sapply(X[, sapply(X, is.logical)], as.numeric)
   dummies <- dummyVars( ~ ., data = X, levelsOnly = FALSE)
   full_X <- predict(dummies, newdata = X)
 
-  sd_across <- get_stand_diffs(full_X, z, selected, denom_variance = denom_variance)
+  sd_across <- get_stand_diffs(full_X, z, selected, treated = treated, control = control,
+                               denom_variance = denom_variance)
 
   sd_strata <- NULL
   for (ist in levels(st)) {
     sd_strata <- rbind(sd_strata, cbind(get_stand_diffs(full_X, z, selected, st, ist,
+                                                        treated = treated, control = control,
                                                         denom_variance = denom_variance), ist))
   }
   colnames(sd_strata)[4] <- "stratum"
 
-  q_s <- sapply(levels(st), function(ist) {sum( !z & selected & st == ist )})
-  n_s <- sapply(levels(st), function(ist) {sum( !z & st == ist )})
+  q_s <- sapply(levels(st), function(ist) {sum( z == control & selected & st == ist )})
+  n_s <- sapply(levels(st), function(ist) {sum( z == control & st == ist )})
 
-  fr_tab <- table(z, st)
   sd_strata_avg <- sd_across
   sd_strata_avg[1:dim(sd_strata_avg)[1], 1:2] <- NA
   for (cov in row.names(sd_strata_avg)) {
     sd_strata_avg[cov, 1] <- sum(sapply(levels(st), function(ist) {
-      sd_strata[sd_strata$covariate == cov & sd_strata$stratum == ist, 1] * (n_s[ist] - sum(is.na(X[!z & st == ist, cov]))) })) / (sum(n_s) - sum(is.na(X[!z, cov])))
+      sd_strata[sd_strata$covariate == cov & sd_strata$stratum == ist, 1] *
+        (n_s[ist] - sum(is.na(X[z == 0 & st == ist, cov]))) })) /
+      (sum(n_s) - sum(is.na(X[z == 0, cov])))
     sd_strata_avg[cov, 2] <- sum(sapply(levels(st), function(ist) {
-      sd_strata[sd_strata$covariate == cov & sd_strata$stratum == ist, 2] * (q_s[ist] - sum(is.na(X[!z & st == ist & selected, cov]))) })) / (sum(q_s) - sum(is.na(X[!z & selected, cov])))
+      sd_strata[sd_strata$covariate == cov & sd_strata$stratum == ist, 2] *
+        (q_s[ist] - sum(is.na(X[z == 0 & st == ist & selected, cov]))) })) /
+      (sum(q_s) - sum(is.na(X[z == 0 & selected, cov])))
   }
 
   if (message) {
@@ -169,41 +179,50 @@ check_balance <- function(z, X, st, selected, denom_variance = "treated", plot =
 #' choosing a subset of controls, and one for after. The rows pertain to covariates.
 #' @keywords internal
 
-get_stand_diffs <- function(data, z, selected, st = NULL, ist = NULL, denom_variance = "treated") {
+get_stand_diffs <- function(data, z, selected, st = NULL, ist = NULL,
+                            treated = 1, control = 0, denom_variance = "treated") {
+  if (is.vector(z)) {
+    z <- as.factor(z)
+  }
   if (!is.null(ist)) {
     ind <- st == ist
   } else {
     ind <- rep(TRUE, length(z))
   }
-  treatedmat_full <- data[z == 1, , drop = FALSE]
-  treatedmat <- data[z == 1 & ind, , drop = FALSE]
   # Standardized differences before matching
-  controlmat_before_full <- data[z == 0, , drop = FALSE]
-  controlmat_before <- data[z == 0 & ind, , drop = FALSE]
+  treatedmat_before_full <- data[z == treated, , drop = FALSE]
+  treatedmat_before <- data[z == treated & ind, , drop = FALSE]
+  treatedmean_before <- apply(treatedmat_before, 2, mean, na.rm = TRUE)
+  controlmat_before_full <- data[z == control, , drop = FALSE]
+  controlmat_before <- data[z == control & ind, , drop = FALSE]
   controlmean_before <- apply(controlmat_before, 2, mean, na.rm = TRUE)
-  treatmean <- apply(treatedmat, 2, mean, na.rm = TRUE)
-  treatvar <- apply(treatedmat_full, 2, var, na.rm = TRUE)
-  controlvar <- apply(controlmat_before_full, 2, var, na.rm = TRUE)
-  if (dim(treatedmat_full)[1] == 1) {
-    treatvar[1:length(treatvar)] <- 0.0
+  variances <- sapply(levels(z), function(group) {
+    return(apply(data[z == group, , drop = FALSE], 2, var, na.rm = TRUE))
+  })
+  if (is.vector(variances)) {
+    variances <- matrix(variances, ncol = 1)
   }
+  variances[is.na(variances)] <- 0
   if (denom_variance == "pooled") {
-    denom <- sqrt((treatvar + controlvar) / 2)
+    denom <- sqrt(rowMeans(variances))
   } else {
-    denom <- sqrt(treatvar)
-    denom[treatvar == 0] <- sqrt(controlvar[treatvar == 0] / 2)
+    denom <- sqrt(variances[, levels(z) == treated])
+    denom[denom == 0] <-
+      sqrt(rowMeans(variances)[denom == 0])
   }
-  stand_diff_before <- rep(NA, length(treatvar))
-  names(stand_diff_before) <- names(treatvar)
-  stand_diff_before <- (treatmean - controlmean_before) / denom
-  stand_diff_before[treatmean == controlmean_before] <- 0.0
+  stand_diff_before <- rep(NA, nrow(variances))
+  names(stand_diff_before) <- dimnames(variances)[[1]]
+  stand_diff_before <- (treatedmean_before - controlmean_before) / denom
+  stand_diff_before[treatedmean_before == controlmean_before] <- 0.0
   # Standardized differences after matching
-  controlmat_after <- data[selected & z == 0 & ind, , drop = FALSE]
+  controlmat_after <- data[selected & z == control & ind, , drop = FALSE]
   controlmean_after <- apply(controlmat_after, 2, mean, na.rm = TRUE)
-  stand_diff_after <- rep(NA, length(treatvar))
-  names(stand_diff_after) <- names(treatvar)
-  stand_diff_after <- (treatmean - controlmean_after) / denom
-  stand_diff_after[treatmean == controlmean_after] <- 0.0
+  treatedmat_after <- data[selected & z == treated & ind, , drop = FALSE]
+  treatedmean_after <- apply(treatedmat_after, 2, mean, na.rm = TRUE)
+  stand_diff_after <- rep(NA, nrow(variances))
+  names(stand_diff_after) <- dimnames(variances)[[1]]
+  stand_diff_after <- (treatedmean_after - controlmean_after) / denom
+  stand_diff_after[treatedmean_after == controlmean_after] <- 0.0
   sd_matrix <- data.frame(abs_stand_diff_before = abs(stand_diff_before),
                           abs_stand_diff_after = abs(stand_diff_after))
   if (!is.null(ist)) {
@@ -329,8 +348,8 @@ plot_stand_diffs <- function(sds, type) {
                                  stratum = sds$sd_strata$stratum)
 
     p <- apply(as.array(unique(sds$sd_strata$stratum)), 1, function(x) {
-        ggplot(plot_dataframe[plot_dataframe$stratum == x,],
-                      aes(x = .data$abs_stand_diff, y = .data$covariates)) +
+      ggplot(plot_dataframe[plot_dataframe$stratum == x,],
+             aes(x = .data$abs_stand_diff, y = .data$covariates)) +
         geom_point(size = 5, aes(shape = .data$type)) +
         scale_shape_manual(values = c(4, 1)) +
         geom_vline(xintercept = c(.1,.2), lty = 2) +