zellerlab · microbiomix · Mar 28, 2026 · Mar 28, 2026 · Mar 28, 2026 · Mar 29, 2026
diff --git a/R/apply_test.R b/R/apply_test.R
@@ -55,13 +55,17 @@ apply.test <- function(sim.location, group, type='default',
 
   for (g in names(all)){
 
+    # skip metadata entry if present
+    if (g == "meta") next
+
     # reconstruct abundance table
     feat.sim <- all[[g]]$features
     markers <- all[[g]]$markers
     test.idx <- all[[g]]$idx
     log.n0 <- all[[g]]$log.n0
     conf.l <- all[[g]]$conf
 
+    # Build full confounder matrix for this group
     if (!is.null(meta.all) & !is.null(conf.l)){
       conf.mat <- cbind(meta.all, conf.l)
       colnames(conf.mat)[ncol(conf.mat)] <- 'conf'
@@ -155,7 +159,7 @@ check.testing.parameters <- function(sim.location, group, type,
                         'ZIBSeq-sqrt',
                         'ANCOM',
                         'ANCOM_old',
-                        'ANCOMBC',
+                        'ANCOMBC', 'ANCOMBC2',
                         "lm", "lm_inter",
                         "corncob",
                         "limma",
@@ -327,7 +331,7 @@ check.testing.parameters <- function(sim.location, group, type,
     }
 
     # check if rep_no in the additional arguments
-    if (length(additional.arguments > 0)) {
+    if (length(additional.arguments) > 0) {
       if ('rep_no' %in% names(additional.arguments)){
         if (length(subset) > 1){
           warning("Parameter 'rep_no' will be ignored, since more",
@@ -336,7 +340,7 @@ check.testing.parameters <- function(sim.location, group, type,
         } else {
           for (a in names(idx.all)){
             idx.all[[a]] <- idx.all[[a]][[
-              paste0('rep', additional.arguments$rep.no), , drop=FALSE]]
+              paste0('rep', additional.arguments$rep_no), , drop=FALSE]]
           }
         }
       }
@@ -355,7 +359,7 @@ check.testing.parameters <- function(sim.location, group, type,
 
     all[[g]]$markers <- complete.group$marker_idx
     if (!is.null(conf) & any(conf!='conf')){
-      all[[meta]] <- h5read(sim.location, 'original_data/metadata')
+      all[["meta"]] <- h5read(sim.location, 'original_data/metadata')
     }
   }
   return(all)

diff --git a/R/create_test_idx.R b/R/create_test_idx.R
@@ -141,7 +141,7 @@ save.idx <- function(ab, rep, subs, reps, class.balance, sim.file,
                        sample(neg.samples, s*class.balance,
                               replace = TRUE)) }
       else {
-        sub.label <- c(sample(pos.samples, ceiling(s*(1-class.balance))),
+        sub.label <- c(sample(pos.samples, s-ceiling(s*class.balance)),
                        sample(neg.samples, ceiling(s*class.balance))) }
       stopifnot(length(sub.label) >= s)
       sub.label <- sub.label[seq(s)]

diff --git a/R/eval_test.R b/R/eval_test.R
@@ -66,7 +66,9 @@ eval.test <- function(sim.location, group, res.mat, adjust='BH', alpha = 0.05) {
 
     # named vector of p.vals for test rep x
     p.val <- res[,x]
-    p.val <- p.adjust(p.val, method=adjust)
+    if (!adjust %in% c("PASS", "pass")) {
+      p.val <- p.adjust(p.val, method=adjust)
+    }
     # auc
     auroc <- roc(predictor = -log10(p.val + 1e-50), response = marker,
                  levels = c(0, 1), direction = '<')

diff --git a/R/helper_clean.R b/R/helper_clean.R
@@ -50,7 +50,7 @@ check.simulation.location <- function(sim.out){
 
 # check input data
 #' @keywords internal
-check.original.data <- function(feat, meta, sim.type, sim.method){
+check.original.data <- function(feat, meta, sim.type, sim.method, factorize.metadata = TRUE){
 
   # check that feat and meta make sense
   # check the class of the features
@@ -108,7 +108,7 @@ check.original.data <- function(feat, meta, sim.type, sim.method){
   }
 
   # validate original data
-  data.list <- validate.original.data(original.data, sim.method)
+  data.list <- validate.original.data(original.data, sim.method, factorize.metadata)
   feat <- data.list$feat
   meta <- data.list$meta
   # remove repeated measurements/individuals with single samples
@@ -172,7 +172,7 @@ check.original.data <- function(feat, meta, sim.type, sim.method){
 
 # validate original data
 #' @keywords internal
-validate.original.data <- function(d.list, sim.method){
+validate.original.data <- function(d.list, sim.method, factorize.metadata = TRUE){
 
   feat.final <- list()
   meta.final <- list()
@@ -278,7 +278,9 @@ validate.original.data <- function(d.list, sim.method){
     meta.final[[j]] <- meta.final[[j]][,colnames.final]
   }
   meta.all <- do.call(rbind, meta.final)
-  meta.all <- clean.meta.data(meta.all)
+  if (factorize.metadata) {
+    meta.all <- clean.meta.data(meta.all)
+  }
   feat.all <- feat.all[,rownames(meta.all)]
   rownames(feat.all) <- paste0('bact_otu_', seq_len(nrow(feat.all)))
   return(list(feat=feat.all, meta=meta.all))
@@ -350,7 +352,7 @@ clean.meta.data <- function(df.meta){
   }
 }
 
-# check simulation parameters
+# check filtering parameters
 #' @keywords internal
 check.filtering.parameters <- function(filt.params){
 
@@ -567,7 +569,7 @@ check.common.simulation.parameters <- function(sim.params, meta, sim.method){
       stop("Parameter 'class.balance' should be numeric")
     } else if (class.balance > 0.85){
       stop("Parameter 'class.balance' seems too high (", class.balance, ")")
-    } else if (class.balance < 0.45){
+    } else if (class.balance < 0.15){
       stop("Parameter 'class.balance' seems too low (", class.balance, ")")
     }
   }

diff --git a/R/helper_resampling.R b/R/helper_resampling.R
@@ -25,6 +25,11 @@ simulate.resampling <- function(feat, meta, sim.out, sim.params){
     balanced <- TRUE
   }
 
+  # optional fixed-label mode via 'fixed.labels=TRUE' in sim.params
+  fixed.labels <- isTRUE(sim.params$fixed.labels)
+  label.col    <- if ('label.col' %in% names(sim.params)) sim.params$label.col else NULL
+  case.value   <- if ('case.value' %in% names(sim.params)) sim.params$case.value else NULL
+  control.value <- if ('control.value' %in% names(sim.params)) sim.params$control.value else NULL
 
   if (conf == 'None'){
     conf <- NULL
@@ -107,6 +112,56 @@ simulate.resampling <- function(feat, meta, sim.out, sim.params){
     }
   }
 
+  ## prepare a fixed label vector once if requested
+  if (fixed.labels){
+    stopifnot(!is.null(label.col))
+    stopifnot(label.col %in% colnames(meta))
+
+    raw.label <- meta[, label.col]
+    names(raw.label) <- rownames(meta)
+
+    keep <- !is.na(raw.label)
+    raw.label <- raw.label[keep]
+
+    if (!is.null(prob)){
+      common.samples <- intersect(names(prob), names(raw.label))
+      raw.label <- raw.label[common.samples]
+      prob <- prob[common.samples]
+      if (!is.null(conf) && exists("conf.label")){
+        conf.label <- conf.label[common.samples]
+      }
+    }
+
+    ## recode to -1 / +1
+    if (!is.null(case.value) && !is.null(control.value)){
+      label.fixed <- ifelse(raw.label == case.value, 1,
+                            ifelse(raw.label == control.value, -1, NA))
+    } else {
+      u <- unique(raw.label)
+      u <- u[!is.na(u)]
+      stopifnot(length(u) == 2)
+      label.fixed <- ifelse(raw.label == u[1], -1, 1)
+      warning("fixed.labels=TRUE but case/control values not supplied; ",
+              "mapped first observed label to -1 and second to +1.")
+    }
+
+    keep2 <- !is.na(label.fixed)
+    label.fixed <- label.fixed[keep2]
+    names(label.fixed) <- names(raw.label)[keep2]
+
+    stopifnot(all(label.fixed %in% c(-1, 1)))
+    num.sample <- length(label.fixed)
+
+    ## consistency check
+    observed.balance <- mean(label.fixed == 1)
+    if (!is.null(class.balance) && abs(observed.balance - class.balance) > 1e-3){
+      message("++ fixed.labels=TRUE: ignoring sim.params$class.balance = ",
+              class.balance,
+              " and using observed balance = ",
+              round(observed.balance, 4))
+    }
+  }
+
   # actual implantation
   pb <- progress_bar$new(total = length(ab.scale)*length(prev.scale)*repeats)
   for (a in seq_along(ab.scale)){
@@ -145,14 +200,28 @@ simulate.resampling <- function(feat, meta, sim.out, sim.params){
         }
 
         # generate label
-        label <- rep(-1, num.sample)
-        s <- sample(num.sample,
-                    size=round(num.sample*class.balance),
-                    prob=prob+1e-20)
-        names(label) <- names(prob)
-        label[s] <- +1 # switch random half of the samples to positive class
-
-        feat.tmp <- feat[,names(label)]
+        if (isTRUE(sim.params$fixed.labels)) {
+          label <- meta[[sim.params$label.col]]
+          names(label) <- rownames(meta)
+
+          # recode to -1 / +1
+          label <- ifelse(label == sim.params$case.value, 1, -1)
+
+          # keep only samples with non-missing labels
+          keep <- !is.na(label)
+          label <- label[keep]
+          feat.tmp <- feat[, names(label), drop = FALSE]
+          num.sample <- length(label)
+        } else {
+          label <- rep(-1, num.sample)
+          s <- sample(num.sample,
+                      size=round(num.sample*class.balance),
+                      prob=prob+1e-20)
+          names(label) <- names(prob)
+          label[s] <- +1 # switch random half of the samples to positive class
+
+          feat.tmp <- feat[,names(label)]
+        }
 
         # sample markers
         marker.idx <- sample(el.feat.names,

diff --git a/R/helper_tests.R b/R/helper_tests.R
@@ -50,6 +50,8 @@ run.test <- function(data, label, test, conf){
     p.val <- test.ANCOM(data=data, label=label, conf=conf)
   } else if (test == 'ANCOMBC'){
     p.val <- test.ANCOMBC(data=data, label=label, conf=conf)
+  } else if (test == 'ANCOMBC2'){
+    p.val <- test.ANCOMBC2(data=data, label=label, conf=conf)
   } else if (test=='corncob'){
     p.val <- test.corncob(data=data, label=label, conf=conf)
   } else if (test=='limma'){
@@ -513,11 +515,48 @@ test.ANCOMBC <- function(data, label, conf){
   x.phylo <- phyloseq::phyloseq(
     otu_table = phyloseq::otu_table(data, taxa_are_rows = TRUE),
     sample_data = phyloseq::sample_data(s.data))
-  temp <- ANCOMBC::ancombc(phyloseq = x.phylo, formula = f.form,
+  temp <- ANCOMBC::ancombc(data = x.phylo, formula = f.form,
                   p_adj_method = 'fdr', lib_cut = 100)
   p.val <- rep(1, nrow(data))
   names(p.val) <- rownames(data)
-  p.val[rownames(temp$res$q_val)] <- temp$res$p_val$label
+  p.val[temp$res$p_val$taxon] <- temp$res$p_val$label
+  return(p.val)
+}
+
+#' @keywords internal
+test.ANCOMBC2 <- function(data, label, conf){
+  #devtools::load_all("/projects/arumugam/people/mjq180/git/ANCOMBC")
+  test.package('ANCOMBC')
+  test.package("phyloseq")
+  # deal with nonunique samples from biased idx generation
+  names(label) <- paste0(names(label), '_', seq_along(label))
+  colnames(data) <- paste0(colnames(data), '_', seq_along(label))
+  label <- label + 1
+  label <- label/2
+  if (is.null(conf)){
+    s.data <- data.frame(label=label, dummy=2)
+    f.form <- 'label'
+  } else {
+    s.data <- cbind(data.frame(label=label, dummy=2), conf)
+    f.form <- paste0('label+', paste(colnames(conf), collapse = '+'))
+  }
+  x.phylo <- phyloseq::phyloseq(
+    otu_table = phyloseq::otu_table(data, taxa_are_rows = TRUE),
+    sample_data = phyloseq::sample_data(s.data))
+  temp <- ancombc2(data = x.phylo,
+                            fix_formula = f.form,
+                            rand_formula = NULL,
+                            p_adj_method = "fdr",
+                            prv_cut = 0.1,
+                            lib_cut = 1000,
+                            group = "label",
+                            struc_zero = TRUE,
+                            neg_lb = TRUE,
+                            alpha = 0.05
+                            )
+  p.val <- rep(1, nrow(data))
+  names(p.val) <- rownames(data)
+  p.val[temp$res$taxon] <- temp$res$p_label
   return(p.val)
 }
 

diff --git a/R/simulate_data.R b/R/simulate_data.R
@@ -73,7 +73,13 @@ create.data.simulation <- function(feat, meta, sim.location,
 
   # check data
   message("+ Start checking data")
-  res <- check.original.data(feat, meta, sim.type, sim.method)
+  factorize.metadata <- TRUE
+  # Only skip factorizing if explicitly requested to skip it!
+  if (!is.null(sim.params[["factorize.metadata"]]) && (sim.params[["factorize.metadata"]] == FALSE)) {
+     factorize.metadata <- FALSE
+  }
+  message("++ Factorize metadata columns: ", factorize.metadata)
+  res <- check.original.data(feat, meta, sim.type, sim.method, factorize.metadata = factorize.metadata)
   message("+ Finished checking data")
   message("")
   feat.original <- res$feat