Visualize_Uncertainty.R

# Further reading: https://www.pnas.org/doi/full/10.1073/pnas.2302491120
# https://www.pnas.org/doi/full/10.1073/pnas.1913678117

if (!requireNamespace("easypackages", quietly = TRUE)) {
  install.packages("easypackages")
}
easypackages::libraries("tidyverse", "tidymodels", "modelr", "brms", "tidybayes", "ggdist", "distributional", "ordinal", "ggrepel", "glue", "emmeans"
)
theme_set(theme_tidybayes())

(df_med <- read_csv("./data/medicine.csv", show_col_types = F) |>
  filter(group == 2) |>
  mutate(
    condition = factor(condition), 
    participant_id = paste0("p_", str_pad(row_number(), 2, pad = "0"))) |> # Take the row numbers generated by row_number() and pads them with leading zeros
  select(-group)
)

# Visualize the original data points
jitter_set <- position_jitter(height = 0.1, seed = 689) # A random seed to make the jitter reproducible
ggplot(
  data = df_med,
  mapping = aes(x = rank, y = condition)) +     
  geom_point(position = jitter_set, alpha = 0.5, color = "#7c20c1") + 
  scale_x_continuous(breaks = min(df_med$rank):max(df_med$rank))

# Build the linear regression model
t_res <- lm(rank ~ - 1 + condition, data = df_med) # -1: the output will change the name of base group from (Intercept) to an actual base group name

# Tidy the linear regression modeling result
t_freq <- t_res |> 
   tidy() |>
   mutate(condition = str_remove(term, "condition"), .before = 1) |> 
   select(-term)
# str_remove() removes the text "condition" from the term column, and the result is stored in the new condition column. The .before = 1 argument specifies that the new column should be placed as the first column in the data frame.

# Make the predictions
pred_t_res <- predict(object = t_res, newdata = data_grid(df_med, condition), 
                      se.fit = T, interval = "prediction", level = 0.95)
# data_grid generates an evenly spaced grid of points from the data

# Tidy the prediction result
pred_t_freq <- t_res |>
  broom::augment(newdata = data_grid(df_med, condition), se_fit = T) |>
  mutate(
    # Estimate the standard error of prediction
    .se.pred = sqrt(.se.fit^2 + pred_t_res$residual.scale^2), 
    df = pred_t_res$df) 

# augment() generates additional columns in the data frame containing the predicted values, and standard errors for the fitted values.
# Variance of prediction is the summation of variance of fit and variance of residual

# Visualize the point predictions per condition, and original data points
t_freq |>
  ggplot(mapping = aes(x = estimate, y = condition)) + 
  geom_point(mapping = aes(x = rank), 
             position = jitter_set, 
             data = df_med, 
             alpha = 0.7, 
             color = "#7c20c1") +
  geom_point(position = position_nudge(y = 0.2)) + # Nudge estimate value per condition vertically by 0.2 units; used for visual separation between points with the same x-values
  scale_x_continuous(name = "Rank", # Set the x-axis title to Rank
                     breaks = min(df_med$rank):max(df_med$rank), 
                     limits = c(0, 10)) + 
  labs(title = "Frequentist t-test, estimated rank by condition")

# Visualize the point prediction per condition, 66% and 95% confidence interval, and original data points
pred_t_freq |>
  ggplot(mapping = aes(y = condition)) + 
  ggdist::stat_pointinterval(
    mapping = aes(xdist = dist_student_t(df = df, mu = .fitted, sigma = .se.fit)),
    position = position_nudge(y = .2)) +
  ggplot2::geom_point(
    data = df_med, 
    mapping = aes(x = rank),
    position = jitter_set,
    alpha = 0.7, 
    color = "#7c20c1"
   ) +
  scale_x_continuous(name = "Rank", 
                     breaks = min(df_med$rank):max(df_med$rank), 
                     limits = c(0, 10)) + 
  labs(title = "Frequentist t-test, estimated rank by condition",
       subtitle = "Confidence interval widths: 0.66 and 0.95")

# Visualize the point prediction per condition, prediction distribution per condition, and original data points
pred_t_freq |>
  ggplot(mapping = aes(y = condition)) + 
  ggdist::stat_slab(
    mapping = aes(xdist = dist_student_t(df = df, mu = .fitted, sigma = .se.pred)),
    slab_color = "#8c96c6",
    fill = NA,
    scale = 0.8) + # scale: What proportion of the region allocated to this geom to use to draw the slab. If scale = 1, slabs that use the maximum range will just touch each other.
  ggrepel::geom_label_repel(
    mapping = aes(estimate + 2, y = condition),
    data = t_freq |> slice_tail(), # Select the last rows of the data frame, 
    color = "#9ECAE1",
    label = "Prediction distribution", 
    box.padding = 1, # There will be some space between the label text and the box enclosing it
    position = position_nudge(y = 0.3), max.overlaps = Inf, seed = 15) +
    # There is no maximum limit for label overlaps, allowing labels to be placed even if they overlap with each other.
    # Using a seed ensures that the same random positioning is reproducible.
  ggplot2::geom_point(
    data = df_med, 
    mapping = aes(x = rank),
    position = jitter_set,
    alpha = 0.7, 
    color = "#7c20c1"
   ) +
  scale_x_continuous(name = "Rank", 
                     breaks = min(df_med$rank):max(df_med$rank), 
                     limits = c(0, 10)) + 
  labs(title = "Frequentist t-test, estimated rank by condition")