d-morrison · d-morrison · Dec 23, 2025 · Dec 23, 2025 · Dec 23, 2025 · Dec 23, 2025
diff --git a/.github/copilot-instructions.md b/.github/copilot-instructions.md
@@ -221,3 +221,115 @@ The package typically works with two types of data frames:
 - GitHub: https://github.com/d-morrison/rwicc
 - Contact: demorrison@ucdavis.edu
 - Reference paper: Morrison et al. (2021), Biometrics. https://doi.org/10.1111/biom.13472
+
+### dplyr Grouping Operations
+
+**Prefer per-operation grouping with `.by` over `group_by()`** where reasonable.
+
+Use the `.by` argument for per-operation grouping instead of the `group_by()` + `ungroup()` pattern.
+**The `.by` parameter uses tidy selection. Use quoted strings to avoid R CMD check notes about undefined global variables.**
+
+**✅ Preferred:**
+```r
+data |>
+  dplyr::summarize(
+    .by = c("ID", "Group"),
+    mean_value = mean(.data$value)
+  )
+
+data |>
+  dplyr::mutate(
+    .by = "ID",
+    centered = .data$value - mean(.data$value)
+  )
+```
+
+**❌ Avoid:**
+```r
+# Old pattern with group_by/ungroup
+data |>
+  dplyr::group_by(.data$ID, .data$Group) |>
+  dplyr::summarize(
+    .groups = "drop",
+    mean_value = mean(.data$value)
+  )
+
+# Using .data$ in .by (incorrect - .by uses tidy selection, not data masking)
+data |>
+  dplyr::summarize(
+    .by = c(.data$ID, .data$Group),  # Wrong!
+    mean_value = mean(.data$value)
+  )
+
+# Bare names in .by (works but may trigger R CMD check warnings)
+data |>
+  dplyr::summarize(
+    .by = c(ID, Group),  # Prefer quoted strings
+    mean_value = mean(.data$value)
+  )
+```
+
+**Reference:** https://dplyr.tidyverse.org/reference/dplyr_by.html
+
+**When to use `.by`:**
+- Single operation that needs grouping (summarize, mutate, filter, slice, etc.)
+- When you would immediately ungroup after the operation
+- When the grouping is only relevant to one step in the pipeline
+
+**When `group_by()` may still be appropriate:**
+- Multiple sequential operations need the same grouping
+- When you need to preserve grouping structure for downstream operations
+- When using functions that don't yet support `.by`
+
+### Tidy Selection vs. Data Masking
+
+Understanding the difference between tidy selection and data masking is crucial for writing correct dplyr code.
+
+**Tidy Selection:**
+- Used in: `select()`, `rename()`, `relocate()`, `across()`, **`.by` parameter**
+- Purpose: Select columns by name, position, or pattern
+- Syntax: Use **quoted strings** (e.g., `"ID"`, `"Group"`) or bare names (e.g., `ID`, `Group`)
+- **Prefer quoted strings** to avoid R CMD check notes about undefined global variables
+- Cannot use `.data$` pronoun (it's not needed and is incorrect)
+- Can use selection helpers: `starts_with()`, `ends_with()`, `contains()`, `where()`, etc.
+
+```r
+# Tidy selection examples (prefer quoted strings)
+data |> dplyr::select("ID", "Group", starts_with("var"))
+data |> dplyr::summarize(.by = c("ID", "Group"), mean = mean(.data$value))
+```
+
+**Data Masking:**
+- Used in: `mutate()`, `filter()`, `summarize()`, `arrange()` (within the expressions)
+- Purpose: Compute on column values
+- Syntax: Use **`.data$` pronoun** for unambiguous column references
+- Helps avoid R CMD check notes about undefined global variables
+- Makes code more explicit and prevents conflicts with function arguments
+
+```r
+# Data masking examples
+data |> dplyr::mutate(new_col = .data$old_col * 2)
+data |> dplyr::filter(.data$status == "active")
+data |> dplyr::summarize(mean_val = mean(.data$value))
+```
+
+**Key Distinction:**
+- `.by` uses **tidy selection** → quoted strings: `.by = c("ID", "Group")`
+- Expression arguments use **data masking** → `.data$` pronoun: `mean(.data$value)`
+
+**Reference:** https://dplyr.tidyverse.org/articles/programming.html
+
+### Non-Standard Evaluation
+
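+In data-masking contexts (e.g., expressions inside `mutate()`, `filter()`, `summarize()`, and `ggplot2::aes()`), always use the `.data$` pronoun for column references to avoid R CMD check notes about undefined global variables. In tidy-selection contexts (e.g., `select()` or the `.by` argument), use quoted strings instead; `.data$` is not valid there.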
+
+**Examples:**
+```r
+# In dplyr (data masking contexts)
+dplyr::mutate(new_col = .data$old_col * 2)
+dplyr::filter(.data$status == "active")
+
+# In ggplot2
+ggplot2::aes(x = .data$time, y = .data$value)
+```
+
diff --git a/R/fit_joint_model.R b/R/fit_joint_model.R
@@ -80,7 +80,7 @@
 # ==============================================================================
 
 #' @importFrom biglm bigglm
-#' @importFrom dplyr group_by summarize n select left_join filter semi_join mutate ungroup any_of if_else lag all_of group_by_at
+#' @importFrom dplyr summarize n select left_join filter semi_join mutate any_of if_else lag all_of
 #' @importFrom lubridate ddays
 #' @importFrom stats binomial coef predict glm quasibinomial
 #' @importFrom lobstr mem_used
@@ -148,8 +148,10 @@ fit_joint_model <- function(
     {
       E_L_combinations <-
         participant_level_data |>
-        dplyr::group_by(.data$Stratum, .data$E, .data$L) |>
-        dplyr::summarize(.groups = "drop", n_IDs = dplyr::n())
+        dplyr::summarize(
+          .by = c("Stratum", "E", "L"),
+          n_IDs = dplyr::n()
+        )
     }
 
     # identify the set of possible seroconversion dates
@@ -192,14 +194,12 @@ fit_joint_model <- function(
           E_L_combinations,
           by = "Stratum"
         ) |>
-        dplyr::group_by(.data$Stratum, .data$S) |>
         dplyr::summarize(
-          .groups = "drop",
+          .by = c("Stratum", "S"),
           "n_definitely_at_risk" =
             denom_offset +
               sum(.data$n_IDs[.data$E <= .data$S & .data$S < .data$L])
-        ) |>
-        dplyr::ungroup()
+        )
     }
   }
 
@@ -221,9 +221,8 @@ fit_joint_model <- function(
     {
       est_hazard_by_stratum <-
         participant_level_data |>
-        dplyr::group_by(.data$Stratum) |>
         dplyr::summarize(
-          .groups = "drop",
+          .by = "Stratum",
           "P(S=s|S>=s,E=e)" = 1 - exp(-lubridate::ddays(bin_width) / mean(.data$`S_hat - E`))
         ) |>
         # this formula actually computes P(S in [s,s+bin_width]|S>=s), from the
@@ -286,9 +285,8 @@ fit_joint_model <- function(
   {
     observed_data_log_likelihood <- function(subj_level_possible_data) {
       log_L <- subj_level_possible_data |>
-        dplyr::group_by(.data$ID) |>
         dplyr::summarize(
-          .groups = "drop",
+          .by = "ID",
           "logL_i" = log(sum(.data$`P(Y=y|T=t)` * .data$`P(S=s|E=e)`))
         ) |>
         dplyr::summarize(
@@ -323,9 +321,8 @@ fit_joint_model <- function(
             omega_hat |> dplyr::select("Stratum", "S", "P(S>s|S>=s,E=e)"),
             by = "Stratum"
           ) |>
-          dplyr::group_by(.data$Stratum, .data$E, .data$L) |>
           dplyr::summarize(
-            .groups = "drop",
+            .by = c("Stratum", "E", "L"),
             "P(S>=l|E=e)" = prod(.data$`P(S>s|S>=s,E=e)`[.data$E <= .data$S & .data$S < .data$L])
           )
         # note: can't add `filter(E <= S, S < L)` before summarize() or we would
@@ -425,9 +422,8 @@ fit_joint_model <- function(
         # sum the estimated at-risk and event probabilities by date:
         n_events_by_date <-
           subj_level_possible_data |>
-          dplyr::group_by_at(c("Stratum", "S")) |>
           dplyr::summarize(
-            .groups = "drop",
+            .by = c("Stratum", "S"),
             "n_events" = sum(`P(S=s|e,l,r,o,y)`),
             "risk_probabilities" = sum(`P(S>=s|e,l,r,o,y)`)
           )

diff --git a/R/update_possible_subj_data.R b/R/update_possible_subj_data.R
@@ -74,8 +74,8 @@ update_possible_subj_data <- function(
       ),
       by = c("S", "Stratum")
     ) |>
-    dplyr::group_by("ID") |>
     dplyr::mutate(
+      .by = "ID",
       "P(S>s|S>=l,E=e)" = cumprod(.data$`P(S>s|S>=s,E=e)`),
       # used for next calculation
 
@@ -98,7 +98,6 @@ update_possible_subj_data <- function(
       "P(S>=s|e,l,r,o,y)" = rev(cumsum(rev(.data$`P(S=s|e,l,r,o,y)`)))
       # used to estimate omega
     ) |>
-    dplyr::ungroup() |>
     dplyr::select(
       c(
         "ID",