|
| 1 | + |
| 2 | +#' @title CROSS VALIDATION OF DEGREES OF FREEDOM |
| 3 | +#' @description Performs cross validation on degrees of freedom parameter of mfqpca |
| 4 | +#' @param data An \eqn{(N \times T)} matrix, a tf object from the tidyfun package or a data.frame containing the functional data as a tf column. |
| 5 | +#' @param group Either a string or an array. If it is a string, it must point to the grouping variable in data only if data is a dataframe. If an array, it must be the N dimensional array indicating the hierarchical structure of the data. Elements in the array with the same value indicate they are repeated measures of the same individual. |
| 6 | +#' @param colname The name of the column containing the functional data. Use only if data is a dataframe and colname is a column in the dataframe. |
| 7 | +#' @param npc.between The number of estimated between level components. |
| 8 | +#' @param npc.within The number of estimated within level components. |
| 9 | +#' @param pve.between Float between 0 and 1. Percentage of variability explained by between level components. This affects the number of components used in the curve reconstruction and error estimation. Set to NULL to avoid this behavior. |
| 10 | +#' @param pve.within Float between 0 and 1. Percentage of variability explained by within level components. This affects the number of components used in the curve reconstruction and error estimation. Set to NULL to avoid this behavior. |
| 11 | +#' @param quantile.value The quantile considered. |
| 12 | +#' @param periodic Boolean indicating if the data is expected to be periodic (start coincides with end) or not. |
| 13 | +#' @param splines.df.grid Grid of possible values for the degrees of freedom. |
| 14 | +#' @param splines.method Method used in the resolution of the splines quantile regression model. It currently accepts the methods \code{c('conquer', 'quantreg')}. |
| 15 | +#' @param n.folds Number of folds to be used on cross validation. |
| 16 | +#' @param return.models Should the list of all the models built be returned? |
| 17 | +#' @param criteria Criteria used to divide the data. Valid values are \code{'rows'}, which considers the division based on full rows, or \code{'points'}, which considers the division based on points within the matrix. |
| 18 | +#' @param tol Tolerance on the convergence of the algorithm. |
| 19 | +#' @param max.iters Maximum number of iterations. |
| 20 | +#' @param verbose.mfqpca Boolean indicating verbosity of the fqpca function. |
| 21 | +#' @param verbose.cv Boolean indicating verbosity of the cross-validation process. |
| 22 | +#' @param seed Seed for the random generator number. |
| 23 | +#' @return A list containing the matrices of scores, the matrices of loadings, and a secondary list with extra information. |
| 24 | +#' @export |
| 25 | +#' @examples |
| 26 | +#' n.individuals <- 20 |
| 27 | +#' n.repeated <- 10 |
| 28 | +#' n.time = 144 |
| 29 | +#' N <- n.repeated * n.individuals |
| 30 | +#' |
| 31 | +#' group <- rep(1:n.individuals, each=n.repeated) |
| 32 | +#' |
| 33 | +#' # Define score values using a normal distribution |
| 34 | +#' c1.vals <- rnorm(n.individuals) |
| 35 | +#' c1.vals <- c1.vals[match(group, unique(group))] |
| 36 | +#' c2.vals <- rnorm(N) |
| 37 | +#' |
| 38 | +#' # Define principal components |
| 39 | +#' pcb <- sin(seq(0, 2*pi, length.out = n.time)) |
| 40 | +#' pcw <- cos(seq(0, 2*pi, length.out = n.time)) |
| 41 | +#' |
| 42 | +#' # Generate a data matrix and add missing observations |
| 43 | +#' Y <- c1.vals * matrix(pcb, nrow = N, ncol=n.time, byrow = TRUE) + |
| 44 | +#' c2.vals * matrix(pcw, nrow = N, ncol=n.time, byrow = TRUE) |
| 45 | +#' Y <- Y + matrix(rnorm(N*n.time, 0, 0.4), nrow = N) |
| 46 | +#' Y[sample(N*n.time, as.integer(0.2*N))] <- NA |
| 47 | +#' |
| 48 | +#' cv_results <- mfqpca_cv_df(data = Y, group = group, splines.df.grid = c(5, 10, 15), n.folds = 2) |
| 49 | +#' |
| 50 | +mfqpca_cv_df <- function( |
| 51 | + data, group, colname=NULL, npc.between = 2, npc.within = 2, pve.between = NULL, pve.within = NULL, |
| 52 | + quantile.value = 0.5, n.folds = 3, return.models = TRUE, criteria = 'points', periodic = TRUE, |
| 53 | + splines.df.grid = c(5, 10, 15, 20), tol = 1e-3, max.iters = 20, |
| 54 | + splines.method = 'conquer', verbose.mfqpca = FALSE, verbose.cv = TRUE, seed = NULL) |
| 55 | +{ |
| 56 | + start_time <- Sys.time() |
| 57 | + if(!base::is.null(seed)){base::set.seed(seed)} |
| 58 | + |
| 59 | + if(!(n.folds == floor(n.folds))){stop('n.folds must be an integer number. Value provided: ', n.folds)} |
| 60 | + if(!(criteria %in% c('rows', 'points'))){stop('Invalid criteria. Valid criterias are c("rows", "points". Value provided: ', criteria)} |
| 61 | + |
| 62 | + # Check the input parameters except Y |
| 63 | + mfqpca_check_params(npc.between=npc.between, npc.within=npc.within, quantile.value=quantile.value, periodic=periodic, |
| 64 | + splines.df=max(npc.within, npc.between, splines.df.grid), splines.method=splines.method, tol=tol, max.iters=max.iters, |
| 65 | + verbose=verbose.mfqpca, seed=seed) |
| 66 | + |
| 67 | + # Check Y and colname and return an unnamed matrix |
| 68 | + r <- mfqpca_check_input_data(data=data, colname=colname, group=group) |
| 69 | + Y <- r[[1]] |
| 70 | + group <- r[[2]] |
| 71 | + |
| 72 | + # KFOLDS |
| 73 | + Y.folds <- create_folds(Y, criteria = criteria, folds = n.folds, seed = seed) |
| 74 | + |
| 75 | + # Initialize error storage |
| 76 | + error.matrix <- matrix(-1, nrow = length(splines.df.grid), ncol = n.folds) |
| 77 | + colnames(error.matrix) <- paste('Fold', 1:n.folds) |
| 78 | + list.models <- list() |
| 79 | + # CROSS VALIDATION |
| 80 | + for(i in seq_along(splines.df.grid)) |
| 81 | + { |
| 82 | + start_loop_time <- Sys.time() |
| 83 | + if(verbose.cv){message('Degrees of freedom: ', splines.df.grid[i], ' ---------------------')} |
| 84 | + splines.df <- splines.df.grid[i] |
| 85 | + |
| 86 | + if(npc.between > splines.df)warning('The npc.between is larger than splines.df\nCurrent splines.df value: ', splines.df, '\nTaking npc.between=splines.df for this iteration of the Cross-Validation process.') |
| 87 | + if(npc.within > splines.df)warning('The npc.within is larger than splines.df\nCurrent splines.df value: ', splines.df, '\nTaking npc.within=splines.df for this iteration of the Cross-Validation process.') |
| 88 | + |
| 89 | + npc.between.iteration <- min(splines.df, npc.between) |
| 90 | + npc.within.iteration <- min(splines.df, npc.within) |
| 91 | + |
| 92 | + for(j in 1:n.folds) |
| 93 | + { |
| 94 | + if(verbose.cv){message(format(Sys.time(), "%Y-%m-%d %H:%M:%S"), '. Fold: ', j)} |
| 95 | + |
| 96 | + # Access data depending on criteria |
| 97 | + if(criteria=='rows') |
| 98 | + { |
| 99 | + Y.train <- Y.folds$Y.train.list[[j]] |
| 100 | + Y.test <- Y.folds$Y.test.list[[j]] |
| 101 | + } else if(criteria == 'points') |
| 102 | + { |
| 103 | + Y.train <- Y.folds$Y.train.list[[j]] |
| 104 | + Y.test <- Y.folds$Y.test.list[[j]] |
| 105 | + } else{stop('Invalid value for criteria. Valid values are observations or curves.')} |
| 106 | + |
| 107 | + # Execute model |
| 108 | + mfqpca_results <- mfqpca( |
| 109 | + data = Y.train, group = group, colname = colname, npc.between = npc.between.iteration, |
| 110 | + npc.within = npc.within.iteration, quantile.value = quantile.value, periodic = periodic, |
| 111 | + splines.df = splines.df, splines.method = splines.method, tol = tol, |
| 112 | + max.iters = max.iters, verbose = verbose.mfqpca, seed = seed) |
| 113 | + |
| 114 | + if(return.models) |
| 115 | + { |
| 116 | + name.model <- paste0('df_idx=', i, '.fold=', j) |
| 117 | + list.models[[name.model]] <- mfqpca_results |
| 118 | + } |
| 119 | + |
| 120 | + # Obtain optimal number of PC |
| 121 | + n.components.between <- obtain_npc(scores=mfqpca_results$scores.between, pve=pve.between) |
| 122 | + n.components.within <- obtain_npc(scores=mfqpca_results$scores.within, pve=pve.within) |
| 123 | + |
| 124 | + if(criteria == 'points') |
| 125 | + { |
| 126 | + scores.between.full <- mfqpca_results$scores.between.full |
| 127 | + scores.within <- mfqpca_results$scores.within |
| 128 | + }else if(criteria=='rows') |
| 129 | + { |
| 130 | + test.scores <- predict.mfqpca_object(mfqpca_results, Y.test) |
| 131 | + scores.between.full <- test.scores$scores.between.full |
| 132 | + scores.within <- test.scores$scores.within |
| 133 | + } |
| 134 | + intercept <- mfqpca_results$intercept |
| 135 | + loadings.between <- mfqpca_results$loadings.between |
| 136 | + loadings.within <- mfqpca_results$loadings.within |
| 137 | + |
| 138 | + Y.pred = |
| 139 | + mfqpca_reconstruct(scores.between.full, loadings.between, n.components.between) + |
| 140 | + mfqpca_reconstruct(scores.within, loadings.within, n.components.within) |
| 141 | + |
| 142 | + Y.pred = sweep(Y.pred, MARGIN = 2, STATS = intercept, FUN = "+") |
| 143 | + |
| 144 | + error.matrix[i, j] <- quantile_error(Y = Y.test, Y.pred = Y.pred, quantile.value = quantile.value) |
| 145 | + } |
| 146 | + |
| 147 | + end_loop_time <- Sys.time() |
| 148 | + if(verbose.cv){message('Degrees of freedom: ', splines.df, '. Execution completed in: ', round(difftime(end_loop_time, start_loop_time, units = "secs"), 3), ' seconds.')} |
| 149 | + } |
| 150 | + |
| 151 | + end_time <- Sys.time() |
| 152 | + execution.time <- difftime(end_time, start_time, units = "secs") |
| 153 | + return(list(error.matrix = error.matrix, execution.time = execution.time, splines.df.grid = splines.df.grid, criteria = criteria, list.models = list.models)) |
| 154 | +} |
0 commit comments