-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathVisualize_Uncertainty.R
113 lines (102 loc) · 5.17 KB
/
Visualize_Uncertainty.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
# Further reading: https://www.pnas.org/doi/full/10.1073/pnas.2302491120
# https://www.pnas.org/doi/full/10.1073/pnas.1913678117
if (!requireNamespace("easypackages", quietly = TRUE)) {
install.packages("easypackages")
}
easypackages::libraries("tidyverse", "tidymodels", "modelr", "brms", "tidybayes", "ggdist", "distributional", "ordinal", "ggrepel", "glue", "emmeans"
)
theme_set(theme_tidybayes())
(df_med <- read_csv("./data/medicine.csv", show_col_types = F) |>
filter(group == 2) |>
mutate(
condition = factor(condition),
participant_id = paste0("p_", str_pad(row_number(), 2, pad = "0"))) |> # Take the row numbers generated by row_number() and pads them with leading zeros
select(-group)
)
# Visualize the original data points
jitter_set <- position_jitter(height = 0.1, seed = 689) # A random seed to make the jitter reproducible
ggplot(
data = df_med,
mapping = aes(x = rank, y = condition)) +
geom_point(position = jitter_set, alpha = 0.5, color = "#7c20c1") +
scale_x_continuous(breaks = min(df_med$rank):max(df_med$rank))
# Build the linear regression model
t_res <- lm(rank ~ - 1 + condition, data = df_med) # -1: the output will change the name of base group from (Intercept) to an actual base group name
# Tidy the linear regression modeling result
t_freq <- t_res |>
tidy() |>
mutate(condition = str_remove(term, "condition"), .before = 1) |>
select(-term)
# str_remove() removes the text "condition" from the term column, and the result is stored in the new condition column. The .before = 1 argument specifies that the new column should be placed as the first column in the data frame.
# Make the predictions
pred_t_res <- predict(object = t_res, newdata = data_grid(df_med, condition),
se.fit = T, interval = "prediction", level = 0.95)
# data_grid generates an evenly spaced grid of points from the data
# Tidy the prediction result
pred_t_freq <- t_res |>
broom::augment(newdata = data_grid(df_med, condition), se_fit = T) |>
mutate(
# Estimate the standard error of prediction
.se.pred = sqrt(.se.fit^2 + pred_t_res$residual.scale^2),
df = pred_t_res$df)
# augment() generates additional columns in the data frame containing the predicted values, and standard errors for the fitted values.
# Variance of prediction is the summation of variance of fit and variance of residual
# Visualize the point predictions per condition, and original data points
t_freq |>
ggplot(mapping = aes(x = estimate, y = condition)) +
geom_point(mapping = aes(x = rank),
position = jitter_set,
data = df_med,
alpha = 0.7,
color = "#7c20c1") +
geom_point(position = position_nudge(y = 0.2)) + # Nudge estimate value per condition vertically by 0.2 units; used for visual separation between points with the same x-values
scale_x_continuous(name = "Rank", # Set the x-axis title to Rank
breaks = min(df_med$rank):max(df_med$rank),
limits = c(0, 10)) +
labs(title = "Frequentist t-test, estimated rank by condition")
# Visualize the point prediction per condition, 66% and 95% confidence interval, and original data points
pred_t_freq |>
ggplot(mapping = aes(y = condition)) +
ggdist::stat_pointinterval(
mapping = aes(xdist = dist_student_t(df = df, mu = .fitted, sigma = .se.fit)),
position = position_nudge(y = .2)) +
ggplot2::geom_point(
data = df_med,
mapping = aes(x = rank),
position = jitter_set,
alpha = 0.7,
color = "#7c20c1"
) +
scale_x_continuous(name = "Rank",
breaks = min(df_med$rank):max(df_med$rank),
limits = c(0, 10)) +
labs(title = "Frequentist t-test, estimated rank by condition",
subtitle = "Confidence interval widths: 0.66 and 0.95")
# Visualize the point prediction per condition, prediction distribution per condition, and original data points
pred_t_freq |>
ggplot(mapping = aes(y = condition)) +
ggdist::stat_slab(
mapping = aes(xdist = dist_student_t(df = df, mu = .fitted, sigma = .se.pred)),
slab_color = "#8c96c6",
fill = NA,
scale = 0.8) + # scale: What proportion of the region allocated to this geom to use to draw the slab. If scale = 1, slabs that use the maximum range will just touch each other.
ggrepel::geom_label_repel(
mapping = aes(estimate + 2, y = condition),
data = t_freq |> slice_tail(), # Select the last rows of the data frame,
color = "#9ECAE1",
label = "Prediction distribution",
box.padding = 1, # There will be some space between the label text and the box enclosing it
position = position_nudge(y = 0.3), max.overlaps = Inf, seed = 15) +
# There is no maximum limit for label overlaps, allowing labels to be placed even if they overlap with each other.
# Using a seed ensures that the same random positioning is reproducible.
ggplot2::geom_point(
data = df_med,
mapping = aes(x = rank),
position = jitter_set,
alpha = 0.7,
color = "#7c20c1"
) +
scale_x_continuous(name = "Rank",
breaks = min(df_med$rank):max(df_med$rank),
limits = c(0, 10)) +
labs(title = "Frequentist t-test, estimated rank by condition")