Issue 1641 survey::svyolr bugs (#1642)

vincentarelbundock · web-flow · commit 629a33c1dca4 · 2025-12-16T12:28:26.000-05:00
* Fix survey comparisons NA filtering and bump version

* survey: match "Intercept:"-prefixed threshold names for ordinal (svyolr) models
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: marginaleffects
 Title: Predictions, Comparisons, Slopes, Marginal Means, and Hypothesis Tests
-Version: 0.31.0.3
+Version: 0.31.0.4
 Authors@R:
     c(person(given = "Vincent",
              family = "Arel-Bundock",
diff --git a/NEWS.md b/NEWS.md
@@ -8,6 +8,8 @@ Bugs:
 
 * Error when merging the original data back into `comparisons()` when the data includes some list columns. The problem is that `data.table` does not support that column type. We now return the original data.table error as a warning, and do not merge the data back. Thanks to @raffaem for report #1638.
 * Improve warning message for hypothesis string order. Thanks to @zakarydraper for report #1640.
+* `comparisons()` with survey-weighted ordinal regressions failed because `stats::na.omit()` discarded every row when auxiliary columns were all `NA`, and the remaining objects fell out of alignment. We now filter using a shared index so hi/lo predictions, weights, and posterior draws stay synchronized.
+* `set_coef.svyolr()` did not recognize thresholds named like `"Intercept: 1|2"`, so delta-method perturbations replaced all cutpoints with `NA` and SEs vanished for `comparisons()`/`avg_*()` on survey ordinal models. Threshold names are now matched with or without the `"Intercept:"` prefix.
 
 ## 0.31.0
 
diff --git a/R/get_comparisons.R b/R/get_comparisons.R
@@ -298,7 +298,9 @@ compare_hi_lo_bayesian <- function(out, draws, draws_hi, draws_lo, draws_or, by,
     # drop missing otherwise get_averages() fails when trying to take a
     # simple mean
     idx_na <- !is.na(out$predicted_lo)
-    out <- stats::na.omit(out, cols = "predicted_lo")
+    # Build a single index and reuse it so `out` and the posterior draws remain aligned
+    # while removing rows with missing `predicted_lo`.
+    out <- out[idx_na]
 
     # TODO: performance is probably terrrrrible here, but splitting is
     # tricky because grouping rows are not always contiguous, and the order
@@ -366,7 +368,11 @@ compare_hi_lo_bayesian <- function(out, draws, draws_hi, draws_lo, draws_or, by,
 
 
 compare_hi_lo_frequentist <- function(out, idx, cross, variables, fun_list, elasticities, newdata) {
-    out <- stats::na.omit(out, cols = "predicted_lo")
+    # When survey weights add all-NA aux columns, stats::na.omit() (which ignores
+    # `cols`) was zeroing out `out`. Build one index and reuse it so downstream
+    # operations rely on the same filtered rows.
+    idx_pred <- !is.na(out$predicted_lo)
+    out <- out[idx_pred]
     # We want to write the "estimate" column in-place because it is safer
     # than group-merge; there were several bugs related to this in the past.
     # safefun() returns 1 value and NAs when the function returns a
@@ -402,13 +408,16 @@ compare_hi_lo_frequentist <- function(out, idx, cross, variables, fun_list, elas
             out <- subset(out, select = idx)
         }
     }
-    out <- stats::na.omit(out, cols = "estimate")
+    out <- out[!is.na(estimate)]
 
     return(out)
 }
 
 
 compare_hi_lo <- function(hi, lo, y, n, term, cross, wts, tmp_idx, newdata, variables, fun_list, elasticities) {
+    if (n == 0 || length(hi) == 0) {
+        return(numeric(0))
+    }
     tn <- term[1]
     eps <- variables[[tn]]$eps
     # when cross=TRUE, sanitize_comparison enforces a single function
diff --git a/R/methods_survey.R b/R/methods_survey.R
@@ -16,7 +16,15 @@ set_coef.svyolr <- function(model, coefs, ...) {
     # in basic model classes coefficients are named vector
     idx <- match(names(model$coefficients), names(coefs))
     model[["coefficients"]] <- coefs[idx]
-    idx <- match(names(model$zeta), names(coefs))
+
+    # thresholds can be named either "1|2" or "Intercept: 1|2"
+    zeta_names <- names(model$zeta)
+    idx <- match(zeta_names, names(coefs))
+    if (anyNA(idx)) {
+        zeta_alt <- paste("Intercept:", zeta_names)
+        idx_alt <- match(zeta_alt, names(coefs))
+        idx <- ifelse(is.na(idx), idx_alt, idx)
+    }
     model[["zeta"]] <- coefs[idx]
     model
 }
diff --git a/R/sanitize_newdata.R b/R/sanitize_newdata.R
@@ -184,7 +184,7 @@ add_wts_column <- function(wts, newdata, model) {
     flag2 <- isTRUE(checkmate::check_numeric(wts, len = nrow(newdata)))
     if (!flag1 && !flag2) {
         msg <- sprintf(
-            "The `wts` argument must be a numeric vector of length %s, or a string which matches a column name in `newdata`. If you did not supply a `newdata` explicitly, `marginaleffects` extracted it automatically from the model object, and the `wts` variable may not have been available. The easiest strategy is often to supply a data frame such as the original data to `newdata` explicitly, and to make sure that it includes an appropriate column of weights, identified by the `wts` argument.",
+            "The `wts` argument must be a numeric vector of length %s, or a string which matches one of the `colnames()` in the data frame that you supplied to the `newdata`, or in the `marginaleffects` objects.",
             nrow(newdata)
         )
         stop(msg, call. = FALSE)
diff --git a/inst/tinytest/test-pkg-survey.R b/inst/tinytest/test-pkg-survey.R
@@ -58,3 +58,23 @@ m <- suppressWarnings(svyglm(
 ))
 cmp <- avg_comparisons(m, variables = "education", by = c("ban", "gender"), wts = "weights", hypothesis = ~reference)
 expect_false(anyNA(cmp$estimate))
+
+# svyolr delta-method standard errors
+set.seed(1234)
+n <- 400
+z <- rbinom(n, 1, 0.5)
+x <- factor(sample(c("1", "2", "3"), n, replace = TRUE))
+beta_x <- c("1" = 0, "2" = 0.4, "3" = -0.3)
+beta_z <- 0.6
+eta <- beta_x[x] + beta_z * z + rlogis(n)
+cuts <- c(-1.5, -0.5, 0.5, 1.5)
+y <- cut(eta, breaks = c(-Inf, cuts, Inf), labels = 1:5, ordered_result = TRUE)
+weights <- rlnorm(n, meanlog = log(50000), sdlog = 1)
+design <- svydesign(ids = ~1, weights = ~weights, data = data.frame(y, x, z, weights))
+ord_svy <- svyolr(y ~ x + z, method = "logistic", design = design)
+cmp <- comparisons(ord_svy, wts = "(weights)")
+expect_true(any(!is.na(cmp$std.error)))
+avg_cmp <- avg_comparisons(ord_svy, wts = "(weights)")
+expect_true(any(!is.na(avg_cmp$std.error)))
+avg_pred <- avg_predictions(ord_svy, wts = "(weights)")
+expect_true(any(!is.na(avg_pred$std.error)))

Original file line number	Diff line number	Diff line change
`@@ -184,7 +184,7 @@ add_wts_column <- function(wts, newdata, model) {`
`184`	`184`	`flag2 <- isTRUE(checkmate::check_numeric(wts, len = nrow(newdata)))`
`185`	`185`	`if (!flag1 && !flag2) {`
`186`	`186`	`msg <- sprintf(`
`187`		- "The `wts` argument must be a numeric vector of length %s, or a string which matches a column name in `newdata`. If you did not supply a `newdata` explicitly, `marginaleffects` extracted it automatically from the model object, and the `wts` variable may not have been available. The easiest strategy is often to supply a data frame such as the original data to `newdata` explicitly, and to make sure that it includes an appropriate column of weights, identified by the `wts` argument.",
	`187`	+ "The `wts` argument must be a numeric vector of length %s, or a string which matches one of the `colnames()` in the data frame that you supplied to the `newdata`, or in the `marginaleffects` objects.",
`188`	`188`	`nrow(newdata)`
`189`	`189`	`)`
`190`	`190`	`stop(msg, call. = FALSE)`