Skip to content

Commit d9dc7f1

Browse files
delfiterradaspinin4fjordsclaude
authored
Add validation for formula-based contrasts (#90)
* Accept spaces in variables in metadata * Add validation against matrix for `make_contrsts_str` * Revert "Accept spaces in variables in metadata" This reverts commit 9f8be32. * Use reformulas::nobars() for robust fixed-effects extraction and make formula-based validation check formula variables, and full rank consistently * Modify failing tests to get a full rank model * Keep exclude_... params in yaml validation and commit suggestions * Accept pre-computed model_matrix * Add test with expected error for failed contrast validation * Fix test * Fix expected error --------- Co-authored-by: Jonathan Manning <jonathan.manning@seqera.io> Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6742562 commit d9dc7f1

3 files changed

Lines changed: 132 additions & 20 deletions

File tree

DESCRIPTION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ Imports:
2525
plotly (>= 4.3.4),
2626
plyr,
2727
RColorBrewer,
28+
reformulas,
2829
reshape2,
2930
scatterplot3d,
3031
shiny,

R/accessory.R

Lines changed: 90 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,61 @@ checkListIsSubset <- function(test_list,
885885
TRUE
886886
}
887887

888+
#' Remove random effects from a model formula
889+
#'
890+
#' @param formula_string Formula string that may contain random effects
891+
#'
892+
#' @return output Fixed-effects-only formula
893+
fixedEffectsFormula <- function(formula_string) {
894+
reformulas::nobars(as.formula(formula_string))
895+
}
896+
897+
#' Build a model matrix from the fixed-effects part of a formula
898+
#'
899+
#' @param formula_string Formula string that may contain random effects
900+
#' @param samples Data frame of sample information
901+
#'
902+
#' @return output Model matrix for the fixed-effects formula
903+
fixedEffectsModelMatrix <- function(formula_string, samples) {
904+
model.matrix(fixedEffectsFormula(formula_string), data = samples)
905+
}
906+
907+
#' Validate a formula-based contrast string against fixed-effect coefficients
908+
#'
909+
#' @param contrast_id Contrast identifier
910+
#' @param contrast_formula Formula string used for the contrast
911+
#' @param contrast_string Contrast string to validate
912+
#' @param model_matrix Optional precomputed model matrix for the fixed-effects formula
913+
#' @param samples Data frame of sample information
914+
#'
915+
#' @return output Returns TRUE if validation passes
916+
validateFormulaBasedContrast <- function(contrast_id,
917+
contrast_formula,
918+
contrast_string,
919+
model_matrix = NULL,
920+
samples) {
921+
if (is.null(model_matrix)) {
922+
model_matrix <- fixedEffectsModelMatrix(contrast_formula, samples)
923+
}
924+
model_coefficients <- make.names(colnames(model_matrix), unique = TRUE)
925+
926+
tryCatch(
927+
limma::makeContrasts(contrasts = contrast_string, levels = model_coefficients),
928+
error = function(e) {
929+
stop(
930+
paste0(
931+
"Contrast id '", contrast_id, "' has invalid make_contrasts_str '", contrast_string,
932+
"' for formula '", contrast_formula, "'. ",
933+
"Available coefficient names for make_contrasts_str: ",
934+
paste(model_coefficients, collapse = ", "),
935+
"."
936+
),
937+
call. = FALSE
938+
)
939+
}
940+
)
941+
}
942+
888943

889944
#' Read and validate a contrasts file against sample metadata
890945
#'
@@ -958,6 +1013,8 @@ read_contrasts <-
9581013
id = x$id,
9591014
variable = NA, reference = NA, target = NA,
9601015
blocking = NA,
1016+
exclude_samples_col = NA,
1017+
exclude_samples_values = NA,
9611018
formula = NA,
9621019
make_contrasts_str = NA,
9631020
stringsAsFactors = FALSE
@@ -1000,6 +1057,15 @@ read_contrasts <-
10001057
stop(sprintf("Contrast id '%s' must provide either 'comparison' or 'formula' + 'make_contrasts_str'.", x$id))
10011058
}
10021059

1060+
if (!is.null(x$exclude_samples_col) || !is.null(x$exclude_samples_values)) {
1061+
if (is.null(x$exclude_samples_col) || is.null(x$exclude_samples_values)) {
1062+
stop(sprintf("Contrast id '%s' must provide both 'exclude_samples_col' and 'exclude_samples_values'.", x$id))
1063+
}
1064+
1065+
row$exclude_samples_col <- x$exclude_samples_col
1066+
row$exclude_samples_values <- paste(x$exclude_samples_values, collapse = ";")
1067+
}
1068+
10031069
row
10041070
}))
10051071
if (any(duplicated(contrasts$id))) {
@@ -1020,21 +1086,21 @@ read_contrasts <-
10201086
if (length(blocking) > 0) {
10211087
success <- checkListIsSubset(blocking, colnames(samples), "blocking variables", "sample metadata")
10221088
}
1089+
if ("exclude_samples_col" %in% colnames(contrasts)) {
1090+
exclude_cols <- na.omit(contrasts$exclude_samples_col)
1091+
if (length(exclude_cols) > 0) {
1092+
success <- checkListIsSubset(exclude_cols, colnames(samples), "exclude sample columns", "sample metadata")
1093+
}
1094+
}
10231095

10241096
# Ensure reference and target are valid for their variable
10251097
for (i in 1:nrow(contrasts)) {
10261098
blocking_vars <- simpleSplit(contrasts[[blocking_column]][i], ";")
1099+
design_cols <- character(0)
10271100

10281101
# Extract design matrix columns from contrasts: the variable column plus any blocking factors.
10291102
# For formula-based contrasts, extract variables from the formula itself.
1030-
formula_vars <- character(0)
1031-
if ("formula" %in% colnames(contrasts) && !is.na(contrasts$formula[i])) {
1032-
formula_vars <- all.vars(as.formula(contrasts$formula[i]))
1033-
}
1034-
10351103
if (validate_design) {
1036-
design_cols <- unique(na.omit(c(contrasts[[variable_column]][i], blocking_vars, formula_vars)))
1037-
10381104
# Filter samples if exclude columns are specified for this contrast
10391105
contrast_samples <- samples
10401106
if ("exclude_samples_col" %in% colnames(contrasts) && "exclude_samples_values" %in% colnames(contrasts)) {
@@ -1045,6 +1111,23 @@ read_contrasts <-
10451111
}
10461112
}
10471113

1114+
if ("formula" %in% colnames(contrasts) && !is.na(contrasts$formula[i])) {
1115+
design_cols <- unique(all.vars(as.formula(contrasts$formula[i])))
1116+
success <- checkListIsSubset(design_cols, colnames(samples), "formula variables", "sample metadata")
1117+
mm <- fixedEffectsModelMatrix(contrasts$formula[i], contrast_samples)
1118+
1119+
validateFormulaBasedContrast(
1120+
contrast_id = contrasts[i, "id"],
1121+
contrast_formula = contrasts$formula[i],
1122+
contrast_string = contrasts$make_contrasts_str[i],
1123+
model_matrix = mm,
1124+
samples = contrast_samples
1125+
)
1126+
} else {
1127+
design_cols <- unique(na.omit(c(contrasts[[variable_column]][i], blocking_vars)))
1128+
mm <- model.matrix(~ . - 1, data = contrast_samples[, design_cols, drop = FALSE])
1129+
}
1130+
10481131
design_matrix <- contrast_samples[, design_cols, drop = FALSE]
10491132

10501133
# Ensure there are no NA values in the design matrix.
@@ -1053,7 +1136,6 @@ read_contrasts <-
10531136
}
10541137

10551138
# Check that the design matrix is full rank.
1056-
mm <- model.matrix(~ . - 1, data = design_matrix)
10571139
if (qr(mm)$rank < ncol(mm)) {
10581140
stop(paste("Design matrix is not full rank.", "Model matrix columns:", paste(colnames(mm), collapse = ", "), "\n"))
10591141
}
@@ -1486,4 +1568,3 @@ cond_log2_transform_assays <- function(assay_data, log2_assays, threshold = 30,
14861568

14871569
return(assay_data)
14881570
}
1489-

tests/testthat/test-accessory.R

Lines changed: 41 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,11 @@ test_that("nlines works", {
3434

3535
test_that("read_contrasts parses YAML correctly", {
3636
samples <- data.frame(
37-
sample = c("Sample1", "Sample7", "Sample13", "Sample19", "Sample16"),
38-
genotype = c("WT", "WT", "KO", "KO", "KO"),
39-
treatment = c("Control", "Treated", "Control", "Treated", "Control"),
40-
time = c(1, 1, 1, 1, 16),
41-
batch = c("b1", "b1", "b1", "b1", "b3"),
37+
sample = c("Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6", "Sample7", "Sample8"),
38+
genotype = c("WT", "WT", "WT", "WT", "KO", "KO", "KO", "KO"),
39+
treatment = c("Control", "Control", "Treated", "Treated", "Control", "Control", "Treated", "Treated"),
40+
time = c(1, 16, 1, 16, 1, 16, 1, 16),
41+
batch = c("b1", "b2", "b1", "b2", "b1", "b2", "b1", "b2"),
4242
stringsAsFactors = FALSE
4343
)
4444

@@ -88,11 +88,11 @@ contrasts:
8888

8989
test_that("read_contrasts parses YAML correctly using only formula based contrasts", {
9090
samples <- data.frame(
91-
sample = c("Sample1", "Sample7", "Sample13", "Sample19", "Sample16"),
92-
genotype = c("WT", "WT", "KO", "KO", "KO"),
93-
treatment = c("Control", "Treated", "Control", "Treated", "Control"),
94-
time = c(1, 1, 1, 1, 16),
95-
batch = c("b1", "b1", "b1", "b1", "b3"),
91+
sample = c("Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6", "Sample7", "Sample8"),
92+
genotype = c("WT", "WT", "WT", "WT", "KO", "KO", "KO", "KO"),
93+
treatment = c("Control", "Control", "Treated", "Treated", "Control", "Control", "Treated", "Treated"),
94+
time = c(1, 16, 1, 16, 1, 16, 1, 16),
95+
batch = c("b1", "b2", "b1", "b2", "b1", "b2", "b1", "b2"),
9696
stringsAsFactors = FALSE
9797
)
9898

@@ -125,4 +125,34 @@ contrasts:
125125
expect_equal(contrasts$make_contrasts_str[3], "genotypeWT.treatmentTreated.time")
126126

127127
unlink(yaml_file)
128-
})
128+
})
129+
130+
test_that("read_contrasts reports a descriptive error for realistic invalid formula contrast strings", {
131+
samples <- data.frame(
132+
sample = c("Sample1", "Sample2", "Sample3", "Sample4", "Sample5", "Sample6", "Sample7", "Sample8"),
133+
genotype = c("WT", "WT", "WT", "WT", "KO", "KO", "KO", "KO"),
134+
treatment = c("Control", "Control", "Treated", "Treated", "Control", "Control", "Treated", "Treated"),
135+
stringsAsFactors = FALSE
136+
)
137+
138+
yaml_content <- "
139+
contrasts:
140+
- id: invalid_formula_contrast
141+
formula: \"~ genotype * treatment\"
142+
make_contrasts_str: \"genotypeWT.treatmenttreated\"
143+
"
144+
145+
yaml_file <- tempfile(fileext = ".yaml")
146+
writeLines(yaml_content, yaml_file)
147+
148+
expect_error(
149+
read_contrasts(yaml_file, samples),
150+
paste(
151+
"Contrast id 'invalid_formula_contrast' has invalid make_contrasts_str 'genotypeWT.treatmenttreated'",
152+
"for formula '~ genotype \\* treatment'. Available coefficient names for make_contrasts_str:",
153+
"X\\.Intercept\\., genotypeWT, treatmentTreated, genotypeWT\\.treatmentTreated\\."
154+
)
155+
)
156+
157+
unlink(yaml_file)
158+
})

0 commit comments

Comments
 (0)