Skip to content

Commit 9fbd7b8

Browse files
committed
MFQPCA update
1 parent b8c95d7 commit 9fbd7b8

File tree

67 files changed

+2653
-712
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+2653
-712
lines changed

DESCRIPTION

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: FunQ
22
Title: Functional Quantile methodologies
3-
Version: 0.1.9
3+
Version: 0.1.10
44
Authors@R:
55
person("Álvaro", "Méndez-Civieta", , "am5490@cumc.columbia.edu", role = c("aut", "cre"),
66
comment = c(ORCID = "0000-0003-2044-4170"))
@@ -15,6 +15,7 @@ RoxygenNote: 7.3.2
1515
Imports:
1616
quantreg,
1717
conquer,
18+
conquer2,
1819
pbs,
1920
rlang,
2021
tidyr,
@@ -34,4 +35,5 @@ Depends:
3435
R (>= 3.5.0)
3536
Remotes:
3637
tidyfun/tf,
38+
alvaromc317/conquer2
3739
Config/testthat/edition: 3

NAMESPACE

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,17 @@
11
# Generated by roxygen2: do not edit by hand
22

33
S3method(fitted,fqpca_object)
4+
S3method(fitted,mfqpca_object)
45
S3method(plot,fqpca_object)
6+
S3method(plot,mfqpca_object)
57
S3method(predict,fqpca_object)
8+
S3method(predict,mfqpca_object)
69
export(create_folds)
7-
export(cross_validation_df)
8-
export(cross_validation_lambda)
910
export(fqpca)
11+
export(fqpca_cv_df)
12+
export(fqpca_cv_lambda)
13+
export(mfqpca)
14+
export(mfqpca_cv_df)
1015
export(proportion_under_quantile)
1116
export(quantile_error)
1217
export(train_test_split)

R/fqpca-utils.R

Lines changed: 95 additions & 322 deletions
Large diffs are not rendered by default.

R/fqpca.R

Lines changed: 271 additions & 219 deletions
Large diffs are not rendered by default.

R/mfqpca-utils.R

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
2+
#' @title CROSS VALIDATION OF DEGREES OF FREEDOM
3+
#' @description Performs cross validation on degrees of freedom parameter of mfqpca
4+
#' @param data An \eqn{(N \times T)} matrix, a tf object from the tidyfun package or a data.frame containing the functional data as a tf column.
5+
#' @param group Either a string or an array. If it is a string, it must point to the grouping variable in data only if data is a dataframe. If an array, it must be the N dimensional array indicating the hierarchical structure of the data. Elements in the array with the same value indicate they are repeated measures of the same individual.
6+
#' @param colname The name of the column containing the functional data. Use only if data is a dataframe and colname is a column in the dataframe.
7+
#' @param npc.between The number of estimated between level components.
8+
#' @param npc.within The number of estimated within level components.
9+
#' @param pve.between Float between 0 and 1. Percentage of variability explained by between level components. This affects the number of components used in the curve reconstruction and error estimation. Set to NULL to avoid this behavior.
10+
#' @param pve.within Float between 0 and 1. Percentage of variability explained by within level components. This affects the number of components used in the curve reconstruction and error estimation. Set to NULL to avoid this behavior.
11+
#' @param quantile.value The quantile considered.
12+
#' @param periodic Boolean indicating if the data is expected to be periodic (start coincides with end) or not.
13+
#' @param splines.df.grid Grid of possible values for the degrees of freedom.
14+
#' @param splines.method Method used in the resolution of the splines quantile regression model. It currently accepts the methods \code{c('conquer', 'quantreg')}.
15+
#' @param n.folds Number of folds to be used on cross validation.
16+
#' @param return.models Should the list of all the models built be returned?
17+
#' @param criteria Criteria used to divide the data. Valid values are \code{'rows'}, which considers the division based on full rows, or \code{'points'}, which considers the division based on points within the matrix.
18+
#' @param tol Tolerance on the convergence of the algorithm.
19+
#' @param max.iters Maximum number of iterations.
20+
#' @param verbose.mfqpca Boolean indicating verbosity of the fqpca function.
21+
#' @param verbose.cv Boolean indicating verbosity of the cross-validation process.
22+
#' @param seed Seed for the random generator number.
23+
#' @return A list containing the matrices of scores, the matrices of loadings, and a secondary list with extra information.
24+
#' @export
25+
#' @examples
26+
#' n.individuals <- 20
27+
#' n.repeated <- 10
28+
#' n.time = 144
29+
#' N <- n.repeated * n.individuals
30+
#'
31+
#' group <- rep(1:n.individuals, each=n.repeated)
32+
#'
33+
#' # Define score values using a normal distribution
34+
#' c1.vals <- rnorm(n.individuals)
35+
#' c1.vals <- c1.vals[match(group, unique(group))]
36+
#' c2.vals <- rnorm(N)
37+
#'
38+
#' # Define principal components
39+
#' pcb <- sin(seq(0, 2*pi, length.out = n.time))
40+
#' pcw <- cos(seq(0, 2*pi, length.out = n.time))
41+
#'
42+
#' # Generate a data matrix and add missing observations
43+
#' Y <- c1.vals * matrix(pcb, nrow = N, ncol=n.time, byrow = TRUE) +
44+
#' c2.vals * matrix(pcw, nrow = N, ncol=n.time, byrow = TRUE)
45+
#' Y <- Y + matrix(rnorm(N*n.time, 0, 0.4), nrow = N)
46+
#' Y[sample(N*n.time, as.integer(0.2*N))] <- NA
47+
#'
48+
#' cv_results <- mfqpca_cv_df(data = Y, group = group, splines.df.grid = c(5, 10, 15), n.folds = 2)
49+
#'
50+
mfqpca_cv_df <- function(
51+
data, group, colname=NULL, npc.between = 2, npc.within = 2, pve.between = NULL, pve.within = NULL,
52+
quantile.value = 0.5, n.folds = 3, return.models = TRUE, criteria = 'points', periodic = TRUE,
53+
splines.df.grid = c(5, 10, 15, 20), tol = 1e-3, max.iters = 20,
54+
splines.method = 'conquer', verbose.mfqpca = FALSE, verbose.cv = TRUE, seed = NULL)
55+
{
56+
start_time <- Sys.time()
57+
if(!base::is.null(seed)){base::set.seed(seed)}
58+
59+
if(!(n.folds == floor(n.folds))){stop('n.folds must be an integer number. Value provided: ', n.folds)}
60+
if(!(criteria %in% c('rows', 'points'))){stop('Invalid criteria. Valid criterias are c("rows", "points". Value provided: ', criteria)}
61+
62+
# Check the input parameters except Y
63+
mfqpca_check_params(npc.between=npc.between, npc.within=npc.within, quantile.value=quantile.value, periodic=periodic,
64+
splines.df=max(npc.within, npc.between, splines.df.grid), splines.method=splines.method, tol=tol, max.iters=max.iters,
65+
verbose=verbose.mfqpca, seed=seed)
66+
67+
# Check Y and colname and return an unnamed matrix
68+
r <- mfqpca_check_input_data(data=data, colname=colname, group=group)
69+
Y <- r[[1]]
70+
group <- r[[2]]
71+
72+
# KFOLDS
73+
Y.folds <- create_folds(Y, criteria = criteria, folds = n.folds, seed = seed)
74+
75+
# Initialize error storage
76+
error.matrix <- matrix(-1, nrow = length(splines.df.grid), ncol = n.folds)
77+
colnames(error.matrix) <- paste('Fold', 1:n.folds)
78+
list.models <- list()
79+
# CROSS VALIDATION
80+
for(i in seq_along(splines.df.grid))
81+
{
82+
start_loop_time <- Sys.time()
83+
if(verbose.cv){message('Degrees of freedom: ', splines.df.grid[i], ' ---------------------')}
84+
splines.df <- splines.df.grid[i]
85+
86+
if(npc.between > splines.df)warning('The npc.between is larger than splines.df\nCurrent splines.df value: ', splines.df, '\nTaking npc.between=splines.df for this iteration of the Cross-Validation process.')
87+
if(npc.within > splines.df)warning('The npc.within is larger than splines.df\nCurrent splines.df value: ', splines.df, '\nTaking npc.within=splines.df for this iteration of the Cross-Validation process.')
88+
89+
npc.between.iteration <- min(splines.df, npc.between)
90+
npc.within.iteration <- min(splines.df, npc.within)
91+
92+
for(j in 1:n.folds)
93+
{
94+
if(verbose.cv){message(format(Sys.time(), "%Y-%m-%d %H:%M:%S"), '. Fold: ', j)}
95+
96+
# Access data depending on criteria
97+
if(criteria=='rows')
98+
{
99+
Y.train <- Y.folds$Y.train.list[[j]]
100+
Y.test <- Y.folds$Y.test.list[[j]]
101+
} else if(criteria == 'points')
102+
{
103+
Y.train <- Y.folds$Y.train.list[[j]]
104+
Y.test <- Y.folds$Y.test.list[[j]]
105+
} else{stop('Invalid value for criteria. Valid values are observations or curves.')}
106+
107+
# Execute model
108+
mfqpca_results <- mfqpca(
109+
data = Y.train, group = group, colname = colname, npc.between = npc.between.iteration,
110+
npc.within = npc.within.iteration, quantile.value = quantile.value, periodic = periodic,
111+
splines.df = splines.df, splines.method = splines.method, tol = tol,
112+
max.iters = max.iters, verbose = verbose.mfqpca, seed = seed)
113+
114+
if(return.models)
115+
{
116+
name.model <- paste0('df_idx=', i, '.fold=', j)
117+
list.models[[name.model]] <- mfqpca_results
118+
}
119+
120+
# Obtain optimal number of PC
121+
n.components.between <- obtain_npc(scores=mfqpca_results$scores.between, pve=pve.between)
122+
n.components.within <- obtain_npc(scores=mfqpca_results$scores.within, pve=pve.within)
123+
124+
if(criteria == 'points')
125+
{
126+
scores.between.full <- mfqpca_results$scores.between.full
127+
scores.within <- mfqpca_results$scores.within
128+
}else if(criteria=='rows')
129+
{
130+
test.scores <- predict.mfqpca_object(mfqpca_results, Y.test)
131+
scores.between.full <- test.scores$scores.between.full
132+
scores.within <- test.scores$scores.within
133+
}
134+
intercept <- mfqpca_results$intercept
135+
loadings.between <- mfqpca_results$loadings.between
136+
loadings.within <- mfqpca_results$loadings.within
137+
138+
Y.pred =
139+
mfqpca_reconstruct(scores.between.full, loadings.between, n.components.between) +
140+
mfqpca_reconstruct(scores.within, loadings.within, n.components.within)
141+
142+
Y.pred = sweep(Y.pred, MARGIN = 2, STATS = intercept, FUN = "+")
143+
144+
error.matrix[i, j] <- quantile_error(Y = Y.test, Y.pred = Y.pred, quantile.value = quantile.value)
145+
}
146+
147+
end_loop_time <- Sys.time()
148+
if(verbose.cv){message('Degrees of freedom: ', splines.df, '. Execution completed in: ', round(difftime(end_loop_time, start_loop_time, units = "secs"), 3), ' seconds.')}
149+
}
150+
151+
end_time <- Sys.time()
152+
execution.time <- difftime(end_time, start_time, units = "secs")
153+
return(list(error.matrix = error.matrix, execution.time = execution.time, splines.df.grid = splines.df.grid, criteria = criteria, list.models = list.models))
154+
}

0 commit comments

Comments
 (0)