-
Notifications
You must be signed in to change notification settings - Fork 19
Expand file tree
/
Copy pathAppendix 03 - outliers.R
More file actions
67 lines (46 loc) · 1.57 KB
/
Copy pathAppendix 03 - outliers.R
File metadata and controls
67 lines (46 loc) · 1.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
library(dplyr)
library(ggplot2)
library(performance)
library(datawizard)
library(see)
# There are many rules of thumb for defining what an outlier IS:
# https://doi.org/10.3758/s13428-024-02356-w
# Load the example data set and peek at its first rows.
tai_missing <- readRDS(file = "data/tai_missing.Rds")
head(tai_missing, n = 6L)

# Flag univariate outliers in `moED` with the IQR rule (threshold of 1.7),
# then print the positions of the flagged observations.
ol_iqr <- check_outliers(
  tai_missing$moED,
  threshold = 1.7,
  method = "iqr"
)
which(ol_iqr)
# plot(ol_iqr)

# The IQR rule is only one univariate method; check_outliers() also offers
# other univariate, multivariate, and model-based methods.
help("check_outliers") # see documentation for details about the various methods
# Here we will explore two popular options for dealing with outliers.
# Drop --------------------------------------------------------------------

# Simplest strategy: keep only the rows that were not flagged as outliers.
tai_no_OL <- filter(tai_missing, !ol_iqr)
# Winsorize ---------------------------------------------------------------

# Winsorizing replaces extreme values by less extreme ones instead of
# dropping them. First list the distinct values that were flagged as outliers:
tai_missing$moED[ol_iqr] |>
  unique() |>
  sort()

# Then winsorize `moED` using the raw limits 1 and 19 (see ?winsorize for how
# `method = "raw"` interprets `threshold`), storing the result in a new column.
tai_winzorize_OL <- mutate(
  tai_missing,
  moED_win = winsorize(moED, method = "raw", threshold = c(1, 19))
)
# Compare -----------------------------------------------------------------

# Stack three histograms of moED (original, outliers omitted, winsorized)
# using the same x-axis limits and the same 1-unit bins, so the panels can be
# compared directly. The braces make the comparison run as one expression.
{
  # par() returns the previous settings; keep them so the layout that was
  # active before this block is restored afterwards instead of being forced
  # to a 1 x 1 grid.
  old_par <- par(mfrow = c(3, 1))

  hist(
    tai_missing$moED,
    main = "Original",
    xlim = c(0, 25),
    breaks = seq(0, 25, 1)
  )
  hist(
    tai_no_OL$moED,
    main = "Omit",
    xlim = c(0, 25),
    breaks = seq(0, 25, 1)
  )
  hist(
    tai_winzorize_OL$moED_win,
    main = "Winsorize",
    xlim = c(0, 25),
    breaks = seq(0, 25, 1)
  )

  # Put the graphics parameters back exactly as they were found.
  par(old_par)
}
# next lesson we will see better plotting methods...