Skip to content

Latest commit

 

History

History
266 lines (231 loc) · 9.04 KB

File metadata and controls

266 lines (231 loc) · 9.04 KB

QDA by Projection Real Data

Breast Cancer

$p=9$, $n=699$. Repeat $m=100$ times for average test error.

seed <- 2020
m <- 300
per <- c(0.4, 0.5, 0.6, 0.7, 0.8)
bc <- read.csv("real_data/breast-cancer-wisconsin.data", header = FALSE)
y <- bc[, 11]
id0 <- which(y == 2)
id1 <- which(y == 4)
y[id0] <- 0
y[id1] <- 1
x <- data.matrix(bc[, 2:10])
pct.LDAQDARDADSDADAPQDAP
0.44.78 (0.05)5.29 (0.06)4.50 (0.05)5.14 (0.05)4.61 (0.06)3.52 (0.03)
0.54.80 (0.05)5.19 (0.06)4.42 (0.05)5.02 (0.06)4.48 (0.06)3.40 (0.04)
0.64.62 (0.06)5.02 (0.07)4.23 (0.06)4.87 (0.06)4.24 (0.06)3.30 (0.04)
0.74.72 (0.07)5.13 (0.08)4.27 (0.07)4.95 (0.08)4.23 (0.07)3.33 (0.06)
0.84.66 (0.09)5.09 (0.10)4.06 (0.08)4.70 (0.09)4.24 (0.09)3.29 (0.08)

Ultrasonic Flowmeter

$p=36$, $n=87$. Repeat $m=100$ times for average test error.

seed <- 2020
m <- 300
per <- c(0.3, 0.4, 0.5, 0.6, 0.7, 0.8)
fm <- read.table("real_data/Meter A.data")
y <- fm[, 37]
id0 <- which(y == 2)
id1 <- which(y == 1)
y[id0] <- 0
y[id1] <- 1
x <- data.matrix(fm[, 1:36])
pct.LDARDADSDADAPQDAP
0.325.87 (0.55)37.20 (0.45)25.21 (0.69)19.67 (0.38)10.64 (0.36)
0.412.58 (0.50)35.37 (0.44)18.76 (0.65)17.33 (0.38)4.10 (0.22)
0.53.39 (0.22)33.50 (0.40)4.33 (0.33)15.34 (0.38)1.40 (0.11)
0.61.58 (0.11)34.05 (0.38)2.94 (0.26)15.52 (0.41)0.89 (0.08)
0.70.81 (0.10)33.64 (0.40)2.02 (0.20)13.83 (0.40)0.64 (0.08)
0.80.43 (0.09)34.08 (0.53)1.22 (0.16)12.73 (0.45)0.69 (0.11)

Heart Disease

$p=13$, $n=303$. Repeat $m=300$ times for average test error.

seed <- 2020
m <- 300
per <- c(0.4, 0.5, 0.6, 0.7, 0.8)
hd <- read.csv("real_data/heart.csv")
y <- hd[, 14]
id0 <- which(y == 0)
id1 <- which(y == 1)
y[id0] <- 0
y[id1] <- 1
x <- data.matrix(hd[, 1:13])
pct.LDAQDARDADSDADAPQDAP
0.418.82 (0.14)22.23 (0.15)19.15 (0.17)19.28 (0.15)20.10 (0.20)18.55 (0.15)
0.517.93 (0.15)21.67 (0.15)17.91 (0.17)18.39 (0.16)18.86 (0.18)17.75 (0.15)
0.617.81 (0.17)20.86 (0.18)17.56 (0.18)18.00 (0.17)18.43 (0.19)17.48 (0.17)
0.717.22 (0.18)20.26 (0.21)16.90 (0.19)17.33 (0.17)17.42 (0.18)16.93 (0.18)
0.817.54 (0.24)19.98 (0.26)17.02 (0.24)17.66 (0.23)17.46 (0.26)17.14 (0.24)

Segment Data

seed <- 2020
m <- 300
per <- c(0.4, 0.5, 0.6, 0.7, 0.8)
sd <- read.table("./real_data/segment.dat")
sd <- sd[which(sd[, 20] %in% c(1, 4)), ]
y <- sd[, 20]
id0 <- which(y == 4)
id1 <- which(y == 1)
y[id0] <- 0
y[id1] <- 1
x <- data.matrix(sd[, c(2, 6:19)])
pct.LDARDADSDADAPQDAP
0.40.73 (0.02)0.86 (0.03)0.92 (0.03)1.70 (0.04)0.73 (0.02)
0.50.74 (0.02)0.79 (0.03)0.90 (0.03)1.68 (0.04)0.73 (0.02)
0.60.72 (0.02)0.78 (0.03)0.84 (0.03)1.64 (0.04)0.69 (0.02)
0.70.73 (0.03)0.70 (0.04)0.86 (0.03)1.57 (0.05)0.71 (0.03)
0.80.76 (0.04)0.67 (0.04)0.89 (0.04)1.59 (0.06)0.73 (0.03)

Satellite Image

seed <- 2020
m <- 300
per <- c(0.4, 0.5, 0.6, 0.7, 0.8)
si <- read.table("./real_data/sat.trn")
si <- si[which(si[, 37] %in% c(1, 3)), ]
y <- si[, 37]
id0 <- which(y == 3)
id1 <- which(y == 1)
y[id0] <- 0
y[id1] <- 1
x <- data.matrix(si[, 1:36])
pct.LDAQDARDADSDADAPQDAP
0.41.40 (0.01)1.94 (0.02)1.39 (0.01)1.43 (0.01)1.61 (0.02)1.37 (0.01)
0.51.37 (0.02)1.90 (0.02)1.37 (0.02)1.41 (0.02)1.56 (0.02)1.32 (0.02)
0.61.37 (0.02)1.79 (0.03)1.38 (0.02)1.39 (0.02)1.54 (0.02)1.32 (0.02)
0.71.33 (0.02)1.78 (0.03)1.36 (0.02)1.37 (0.02)1.59 (0.03)1.29 (0.02)
0.81.31 (0.03)1.70 (0.03)1.34 (0.03)1.36 (0.03)1.55 (0.04)1.27 (0.03)

Various Code Blocks

Data Split Code

set.seed(seed)
id <- datasplit(id0 = id0, id1 = id1,
                   m = m, per = per)

Models Evaluation Code

len_per <- length(per)
pred_err <- vector("list", len_per)
if (parallel == TRUE) {
  Sys.setenv(OMP_NUM_THREADS = 1)
  for (i in 1:len_per) {
    pred_err[[i]] <- foreach(j = 1:m, .combine = rbind,
                             .options.RNG = seed) %dorng%
      data_analysis(x = x[id[[i]][[j]], ], y = y[id[[i]][[j]]],
                    xnew = x[-id[[i]][[j]], ], ynew = y[-id[[i]][[j]]],
                    qdap = qdap, lda = lda, qda = qda, dsda = dsda,
                    sqda = sqda, rda = rda)
  }
  Sys.setenv(OMP_NUM_THREADS = 4)
} else {
  for (i in 1:len_per) {
    for (j in 1:m) {
      pred_err[[i]] <-
        rbind(pred_err[[i]],
              data_analysis(x = x[id[[i]][[j]], ], y = y[id[[i]][[j]]],
                            xnew = x[-id[[i]][[j]], ], ynew = y[-id[[i]][[j]]],
                            qdap = qdap, lda = lda, qda = qda, dsda = dsda,
                            sqda = sqda, rda = rda))
    }
  }
}
obj_name <- paste0("pred_err_", name)
assign(obj_name, pred_err)
if (test == FALSE)
  resave(list = obj_name, file = "out/real_data_summary.RData")

Data Summary Code

obj_name <- paste0("pred_err_", name)
out <- paste0("out/", name, ".pdf")
len_per <- length(per)
data_summarized <- NULL
for (i in 1:len_per) {
  data_summarized <-
    rbind(data_summarized,
          data.frame(summary_se(get(obj_name)[[i]]), pct. = per[i]))
}
pdf(out)
pd <- position_dodge(0.01)
plot <- ggplot(data_summarized, aes(x = pct., y = prediction.error,
                                    colour = method)) +
  scale_x_continuous(breaks = per) +
  geom_errorbar(aes(ymin = prediction.error - ci.95,
                    ymax = prediction.error + ci.95),
                width = .02, position = pd) +
  geom_line(position = pd) +
  geom_point(position = pd)
print(plot)
dev.off()
data_summarized %>%
  dplyr::select(- ci.95) %>%
  mutate(prediction.error
         = format(round(prediction.error * 100, 2), nsmall = 2)) %>%
  mutate(standard.error = paste0("(", format(round(standard.error * 100, 2),
                                             nsmall = 2), ")")) %>%
  unite(col = prediction.error, prediction.error, standard.error, sep = " ") %>%
  spread(key = method, value = prediction.error)