|
| 1 | + |
| 2 | +#' Function for plotting the CDF of all the genes within a gene set or for one gene |
| 3 | +#' |
| 4 | +#' @param x an object of class \code{\link{cit_gsa}} |
| 5 | +#' |
| 6 | +#' @param ... further arguments to be passed |
| 7 | +#' |
| 8 | +#' @return a \code{ggplot2} of gene-wise or geneset ccdf |
| 9 | +#' |
| 10 | +#'@importFrom viridisLite viridis |
| 11 | +#'@import ggplot2 |
| 12 | +#' |
| 13 | +#' @export |
| 14 | +#' |
| 15 | +#' @examples |
| 16 | +#' #TO DO |
| 17 | + |
| 18 | + |
| 19 | + |
| 20 | + |
| 21 | +plot.citcdf <- function(x, ...){ |
| 22 | + |
| 23 | + if(x$type == "gsa"){ |
| 24 | + |
| 25 | + ccdf_all <- x$ccdf |
| 26 | + |
| 27 | + for (i in 1:length(ccdf_all)){ |
| 28 | + ccdf <- ccdf_all[[i]] |
| 29 | + |
| 30 | + genes <- names(ccdf[[1]]) |
| 31 | + |
| 32 | + # test_ccdf[[1]][[1]] # accède à Y1 Y2 names(ccdf_all[[1]][[1]]) |
| 33 | + # test_ccdf[[2]][[1]] # accède à Y3 Y4 names(ccdf_all[[2]][[1]]) |
| 34 | + |
| 35 | + # Create data frame with all the ccdf ---- |
| 36 | + data_gene <- do.call(rbind, lapply(genes, function(name) { |
| 37 | + x <- ccdf[[name]] |
| 38 | + df <- cbind.data.frame(x$cdf, x$ccdf, x$x, x$y) |
| 39 | + colnames(df) <- c("cdf", "ccdf", "x", "y") |
| 40 | + df$Gene <- rep(name, nrow(df)) |
| 41 | + return(df) |
| 42 | + })) |
| 43 | + |
| 44 | + sev <- unique(data_gene$x) |
| 45 | + |
| 46 | + |
| 47 | + # Complete data if no values for some y ---- |
| 48 | + data_gene_sep <- split(data_gene, list(data_gene$Gene, data_gene$x)) # separate the table by genes and x |
| 49 | + |
| 50 | + new_data_gene_sep <- lapply(data_gene_sep, function(df) { |
| 51 | + # max value of current y |
| 52 | + max_y_cur <- max(unique(df$y)) |
| 53 | + # max value of total y |
| 54 | + max_y_all <- max(unique(data_gene$y)) |
| 55 | + |
| 56 | + # all the y that are between max current y value and max total y value |
| 57 | + y_differ_data <- setdiff(data_gene$y, df$y) # y values that are not in current data |
| 58 | + miss_y <- subset(y_differ_data, y_differ_data >= max_y_cur & y_differ_data <= max_y_all) # y values that are not in current data (previous) and between the 2 max |
| 59 | + n_miss_y <- length(miss_y) |
| 60 | + |
| 61 | + # add the new y in the data + affect the max value of the ccdf to these y |
| 62 | + # WARNING : ccdf sorted not y ! |
| 63 | + if (n_miss_y != 0){ |
| 64 | + df <- data.frame(cbind(rep(max(df$cdf),n_miss_y), |
| 65 | + rep(max(df$ccdf),n_miss_y), |
| 66 | + rep(df$x[1],n_miss_y), miss_y, |
| 67 | + rep(df$Gene[1],n_miss_y))) |
| 68 | + colnames(df) <- c("cdf","ccdf","x","y","Gene") |
| 69 | + df$x <- factor(df$x, levels = rep(1:length(levels(data_gene$x))) , labels = levels(data_gene$x) ) |
| 70 | + } else{ |
| 71 | + df <- 0 |
| 72 | + } |
| 73 | + return(df) |
| 74 | + }) |
| 75 | + |
| 76 | + # Combine the data set |
| 77 | + final_data <- do.call(rbind,lapply(genes, function(name){ |
| 78 | + # if some genes doesn't have missing y values |
| 79 | + no_zero <- do.call(rbind,Filter(function(x) {!is.numeric(x) || !all(x == 0)}, new_data_gene_sep)) |
| 80 | + # new dataset completed |
| 81 | + df <- rbind(data_gene[data_gene$Gene==name,], no_zero[no_zero$Gene==name,]) |
| 82 | + df$y <- as.numeric(df$y) |
| 83 | + df$cdf <- as.numeric(df$cdf) |
| 84 | + df$ccdf <- as.numeric(df$ccdf) |
| 85 | + return(df) |
| 86 | + })) |
| 87 | + |
| 88 | + # Thresholds ---- |
| 89 | + Y_after <- final_data$y |
| 90 | + number_y = 20 |
| 91 | + # y value for each thresholds |
| 92 | + y_after <- seq(from = ifelse(length(which(Y_after==0))==0, min(Y_after), min(Y_after[-which(Y_after==0)])), |
| 93 | + to = max(Y_after[-which.max(as.matrix(Y_after))]), length.out = number_y) |
| 94 | + |
| 95 | + seuils <- c(0,y_after) |
| 96 | + |
| 97 | + # Compute the median ---- |
| 98 | + med <- lapply(sev, function(x){ |
| 99 | + med <- rep(NA, number_y) |
| 100 | + filtre_x <- final_data$x == x |
| 101 | + filtre_row_i0 <- final_data$y >= seuils[1] & final_data$y < seuils[1+1] |
| 102 | + indices0 <- which(filtre_x & filtre_row_i0) |
| 103 | + med[1] <- median(final_data$ccdf[indices0]) |
| 104 | + |
| 105 | + for (i in 2:length(seuils)){ |
| 106 | + |
| 107 | + filtre_row_i <- final_data$y >= seuils[i] & final_data$y < seuils[i+1] |
| 108 | + indices <- which(filtre_x & filtre_row_i) |
| 109 | + |
| 110 | + med[i] <- median(final_data$ccdf[indices]) |
| 111 | + |
| 112 | + ref_value <- med[i-1] |
| 113 | + range_value <- final_data[filtre_x & filtre_row_i, ]$ccdf |
| 114 | + |
| 115 | + closest_index <- min(which(range_value > ref_value)) |
| 116 | + closest_value <- range_value[closest_index] |
| 117 | + |
| 118 | + if (med[i] < med[i-1] & closest_index != Inf ){ # force monotony when we can, if no values > previous median leave the current median |
| 119 | + med[i] <- closest_value |
| 120 | + } |
| 121 | + } |
| 122 | + med <- data.frame(med[1:number_y]) |
| 123 | + med$y <- y_after |
| 124 | + med$x <- x |
| 125 | + names(med)[1] <- "ccdf" |
| 126 | + |
| 127 | + return(med) |
| 128 | + }) |
| 129 | + |
| 130 | + # Replace values with the max : if some values are below the max after it |
| 131 | + replace_max <- lapply(med, function(m){ |
| 132 | + ind_max <- which.max(m$ccdf) |
| 133 | + m[ind_max:nrow(m),]$ccdf <- max(m$ccdf) |
| 134 | + return(m) |
| 135 | + }) |
| 136 | + |
| 137 | + # Step function : add the ccdf to the y values that are not a thresholds |
| 138 | + step_function <- lapply(replace_max, function(df){ |
| 139 | + # y values that are not in the data of the thresholds |
| 140 | + new_y <- data.frame(setdiff(final_data$y, df$y)) ; colnames(new_y) <- "y" |
| 141 | + # separate the values between the intervals of the thresholds |
| 142 | + intervalle_indices <- findInterval(new_y$y, df$y, left.open = TRUE) |
| 143 | + indices_int <- intervalle_indices+1 |
| 144 | + |
| 145 | + new_y$ccdf <- df$ccdf[indices_int] |
| 146 | + new_y$x <- unique(df$x) |
| 147 | + |
| 148 | + return(new_y) |
| 149 | + }) |
| 150 | + |
| 151 | + # Data final ----- |
| 152 | + data_med <- rbind(do.call(rbind,step_function),do.call(rbind,med)) |
| 153 | + |
| 154 | + # Data modified for the legend |
| 155 | + data_med$Gene <- "all" |
| 156 | + data_med$Legend <- "Gene set summary" |
| 157 | + |
| 158 | + data_gene <- data_gene[-1] |
| 159 | + data_gene$Legend <- "Genes" |
| 160 | + |
| 161 | + |
| 162 | + comb_data <- rbind(data_gene,data_med) |
| 163 | + |
| 164 | + # Plot ----- |
| 165 | + p <- ggplot(data = comb_data, aes(x = y, color=x, y = ccdf,linetype = Legend, size=Legend)) + |
| 166 | + geom_line(aes(group = interaction(Gene,x))) + |
| 167 | + scale_size_manual(values = c(0.8,0.2)) + # modify row size according to the variable in aes(size) |
| 168 | + labs(x = "Gene expression") + |
| 169 | + theme_minimal() |
| 170 | + |
| 171 | + return(p) |
| 172 | + } |
| 173 | + |
| 174 | + |
| 175 | + } |
| 176 | + |
| 177 | + |
| 178 | + # elseif (x$type == genewise){ |
| 179 | + |
| 180 | + #} |
| 181 | + |
| 182 | + |
| 183 | +} |
| 184 | + |
| 185 | + |
| 186 | + |
| 187 | + |
| 188 | + |
| 189 | + |
| 190 | + |
| 191 | + |
| 192 | + |
| 193 | + |
| 194 | + |
| 195 | + |
0 commit comments