|
#' NNS SD-based Clustering
#'
#' Clusters a set of variables by iteratively extracting Stochastic Dominance (SD)-efficient sets,
#' subject to a minimum cluster size.
#'
#' @param data A numeric matrix or data frame of variables to be clustered. Columns without
#' names are assigned default names \code{"V1"}, \code{"V2"}, ... so cluster membership can be reported.
#' @param degree Numeric options: (1, 2, 3). Degree of stochastic dominance test.
#' @param type Character, either \code{"continuous"} (default) or \code{"discrete"}; specifies the type of CDF.
#' @param min_cluster Integer; \code{1} (default). The minimum number of elements required for a valid cluster.
#' @param dendrogram Logical; \code{FALSE} (default). If \code{TRUE}, a dendrogram is produced based on a simple "distance" measure between clusters.
#'
#' @return
#' A list with the following components:
#' \itemize{
#'   \item \code{Clusters}: A named list of cluster memberships where each element is the set of variable names belonging to that cluster.
#'   \item \code{Dendrogram} (optional): If \code{dendrogram = TRUE}, an \code{hclust} object is also returned.
#' }
#'
#' @details
#' The function applies \code{\link{NNS.SD.efficient.set}} iteratively, peeling off the SD-efficient set at each step
#' if it meets or exceeds \code{min_cluster} in size, until no more subsets can be extracted or all variables are exhausted.
#' Variables in each SD-efficient set form a cluster, with any remaining variables aggregated into the final cluster if it meets
#' the \code{min_cluster} threshold. If \code{ncol(data) <= min_cluster}, all variables are returned
#' as a single cluster.
#'
#' @author Fred Viole, OVVO Financial Systems
#'
#' @references Viole, F. and Nawrocki, D. (2016) "LPM Density Functions for the Computation of the SD Efficient Set." Journal of Mathematical Finance, 6, 105-126. \doi{10.4236/jmf.2016.61012}.
#'
#' Viole, F. (2017) "A Note on Stochastic Dominance." \doi{10.2139/ssrn.3002675}
#'
#' @examples
#' \dontrun{
#' set.seed(123)
#' x <- rnorm(100)
#' y <- rnorm(100)
#' z <- rnorm(100)
#' A <- cbind(x, y, z)
#'
#' # Perform SD-based clustering (degree 1), requiring at least 2 elements per cluster
#' results <- NNS.SD.cluster(data = A, degree = 1, min_cluster = 2)
#' print(results$Clusters)
#'
#' # Produce a dendrogram as well
#' results_with_dendro <- NNS.SD.cluster(data = A, degree = 1, min_cluster = 2, dendrogram = TRUE)
#' plot(results_with_dendro$Dendrogram)
#' }
#'
#' @export


NNS.SD.cluster <- function(data, degree = 1, type = "continuous", min_cluster = 1, dendrogram = FALSE) {
  clusters <- list()
  iteration <- 1

  # Work on a matrix copy; clusters are tracked by column name.
  remaining_data <- as.matrix(data)

  # Guarantee usable column names so cluster membership can be reported.
  # (as.matrix() on an unnamed matrix yields NULL colnames.)
  if (is.null(colnames(remaining_data))) {
    colnames(remaining_data) <- paste0("V", seq_len(ncol(remaining_data)))
  }

  # Peel off SD-efficient sets while enough variables remain to split further.
  while (ncol(remaining_data) > min_cluster) {
    SD_set <- NNS.SD.efficient.set(remaining_data, degree = degree, type = type, status = FALSE)

    # No efficient subset could be extracted: stop peeling; leftovers are
    # collected into a final cluster below.
    if (length(SD_set) == 0) {
      break
    }

    # The SD-efficient set forms the next cluster.
    clusters[[paste0("Cluster_", iteration)]] <- SD_set

    # Drop the extracted variables; drop = FALSE keeps matrix structure
    # even when a single column remains.
    remaining_data <- remaining_data[, !(colnames(remaining_data) %in% SD_set), drop = FALSE]

    iteration <- iteration + 1

    # Too few variables left to split again: they form the final cluster.
    if (ncol(remaining_data) <= min_cluster) {
      if (ncol(remaining_data) > 0) {
        clusters[[paste0("Cluster_", iteration)]] <- colnames(remaining_data)
      }
      break
    }
  }

  # Collect any leftover variables (loop never entered, or stopped on an
  # empty SD set) into the final cluster. BUG FIX: the original condition
  # required ncol(remaining_data) > min_cluster, which silently discarded
  # all variables whenever ncol(data) <= min_cluster from the start.
  if (ncol(remaining_data) > 0 && !(paste0("Cluster_", iteration) %in% names(clusters))) {
    clusters[[paste0("Cluster_", iteration)]] <- colnames(remaining_data)
  }

  # If the final cluster is undersized, fold it into the preceding cluster
  # (only possible when more than one cluster exists).
  if (length(clusters) > 1) {
    final_cluster_name <- names(clusters)[length(clusters)]
    if (length(clusters[[final_cluster_name]]) < min_cluster) {
      previous_cluster_name <- names(clusters)[length(clusters) - 1]
      clusters[[previous_cluster_name]] <- c(clusters[[previous_cluster_name]], clusters[[final_cluster_name]])
      clusters[[final_cluster_name]] <- NULL
    }
  }

  if (dendrogram) {
    # Flatten memberships and label each variable with its cluster index.
    all_vars <- unlist(clusters)
    cluster_labels <- rep(seq_along(clusters), lengths(clusters))

    # Hierarchical clustering needs at least two variables.
    if (length(all_vars) < 2) {
      warning("Not enough variables for hierarchical clustering. Returning clusters only.")
      return(list("Clusters" = clusters, "Dendrogram" = NULL))
    }

    # Simple "distance": absolute difference between cluster indices, so
    # members of the same cluster are at distance 0.
    dist_matrix <- as.dist(outer(cluster_labels, cluster_labels, function(a, b) abs(a - b)))
    # For a "dist" object, labels live in the "Labels" attribute, not rownames.
    attr(dist_matrix, "Labels") <- all_vars

    hc <- hclust(dist_matrix, method = "complete")

    plot(hc,
         main = paste0("Hierarchical Clustering of Stochastic Dominance Sets \nSD Degree: ", degree),
         xlab = "Variables",
         ylab = "SD Distance",
         sub = ""
    )

    return(list("Clusters" = clusters, "Dendrogram" = hc))
  }

  list("Clusters" = clusters)
}
| 129 | + |
0 commit comments