From cb9c48bad523e76eff55711264e2d3a870c63cef Mon Sep 17 00:00:00 2001 From: Katelyn Queen Date: Fri, 1 Nov 2024 12:11:43 -0700 Subject: [PATCH 1/4] Fix bug when first cluster is of size one First cluster to be partitioned is now always the largest cluster; if all clusters are of size one, algorithm will stop --- R/super_partition.R | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/R/super_partition.R b/R/super_partition.R index f39b601..e485f0d 100644 --- a/R/super_partition.R +++ b/R/super_partition.R @@ -182,11 +182,17 @@ super_partition <- function(full_data, width = 100 ) } - - ## first cluster + + # if no dimension reduction, stop + if (length(unique(master_cluster$cluster)) == ncol(full_data)) stop("No dimension reduction occured. Try a lower threshold.") + + ## first cluster - always use largest cluster + clust_sizes <- as.data.frame(table(master_cluster$cluster)) + first_clust <- which(unique(master_cluster$cluster) == clust_sizes[which.max(clust_sizes$Freq), 1]) + # get initial partition to build off part_master <- partition( - full_data[, which(master_cluster$cluster == unique(master_cluster$cluster)[1])], + full_data[, which(master_cluster$cluster == unique(master_cluster$cluster)[first_clust])], threshold, partitioner, tolerance, niter, x, .sep ) @@ -194,7 +200,7 @@ super_partition <- function(full_data, mod_rows <- grep(x, part_master$mapping_key$variable) part_master$mapping_key$indices <- full_data_col_numbers( full_data = full_data, - small_data = full_data[, which(master_cluster$cluster == unique(master_cluster$cluster)[1])], + small_data = full_data[, which(master_cluster$cluster == unique(master_cluster$cluster)[first_clust])], modules = part_master$mapping_key$indices ) @@ -208,7 +214,10 @@ super_partition <- function(full_data, if (progress_bar) pb$tick() # for each cluster... - for (i in 2:n_iter) { + for (i in 1:n_iter) { + # skip if first cluster + if(i == first_clust) next() + # what to do if cluster is of size one if (sum(master_cluster$cluster == unique(master_cluster$cluster)[i]) == 1) { # cbind data to master partition reduced data From a22ce75d666380fab52a5fcfe5ba16f48cb56081 Mon Sep 17 00:00:00 2001 From: Katelyn Queen Date: Mon, 4 Nov 2024 09:56:06 -0800 Subject: [PATCH 2/4] Use Partition if Super Partition results in no dimension reduction --- R/super_partition.R | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/R/super_partition.R b/R/super_partition.R index e485f0d..310eb36 100644 --- a/R/super_partition.R +++ b/R/super_partition.R @@ -73,7 +73,7 @@ super_partition <- function(full_data, # if < cluster_size features, call regular partition if (ncol(full_data) < cluster_size) { message(paste0("Using `partition()` since there are < ", cluster_size, "features.")) - return(partition(full_data, threshold = threshold)) + return(partition(full_data, threshold, partitioner, tolerance, niter, x, .sep)) } # iteration counters @@ -183,8 +183,11 @@ super_partition <- function(full_data, ) } - # if no dimension reduction, stop - if (length(unique(master_cluster$cluster)) == ncol(full_data)) stop("No dimension reduction occured. Try a lower threshold.") + # if no dimension reduction, use partition instead + if (length(unique(master_cluster$cluster)) == ncol(full_data)) { + if (verbose) ("No dimension reduction occured using Super Partition. Using Partition instead.") + return(partition(full_data, threshold, partitioner, tolerance, niter, x, .sep)) + } ## first cluster - always use largest cluster clust_sizes <- as.data.frame(table(master_cluster$cluster)) From 3a0044320113ae1db82ccff6dded0bfca38aa33e Mon Sep 17 00:00:00 2001 From: Katelyn Queen Date: Mon, 4 Nov 2024 10:25:07 -0800 Subject: [PATCH 3/4] Fix typo --- R/super_partition.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/super_partition.R b/R/super_partition.R index 310eb36..732c60c 100644 --- a/R/super_partition.R +++ b/R/super_partition.R @@ -185,7 +185,7 @@ super_partition <- function(full_data, # if no dimension reduction, use partition instead if (length(unique(master_cluster$cluster)) == ncol(full_data)) { - if (verbose) ("No dimension reduction occured using Super Partition. Using Partition instead.") + if (verbose) message("No dimension reduction occured using Super Partition. Using Partition instead.") return(partition(full_data, threshold, partitioner, tolerance, niter, x, .sep)) } From d14c3f1a21a7aa817cafa7d474bb174a60246db6 Mon Sep 17 00:00:00 2001 From: Malcolm Barrett Date: Sun, 10 Nov 2024 13:21:20 -0500 Subject: [PATCH 4/4] clean up --- R/super_partition.R | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/R/super_partition.R b/R/super_partition.R index 732c60c..311820f 100644 --- a/R/super_partition.R +++ b/R/super_partition.R @@ -62,10 +62,14 @@ super_partition <- function(full_data, } # ensure 0 < threshold < 1 - if (0 > threshold | 1 < threshold) stop("Threshold must be between 0 and 1.") + if (0 > threshold | 1 < threshold) { + stop("Threshold must be between 0 and 1.") + } # ensure no column names contain x - if (any(grepl(x, colnames(full_data)))) stop(paste0("The prefix for new variable names, ", x, ", is contained within existing data column names. Please choose a different prefix to avoid errors.")) + if (any(grepl(x, colnames(full_data)))) { + stop(paste0("The prefix for new variable names, ", x, ", is contained within existing data column names. Please choose a different prefix to avoid errors.")) + } # ensure data frame structure full_data <- as.data.frame(full_data) @@ -73,7 +77,16 @@ super_partition <- function(full_data, # if < cluster_size features, call regular partition if (ncol(full_data) < cluster_size) { message(paste0("Using `partition()` since there are < ", cluster_size, "features.")) - return(partition(full_data, threshold, partitioner, tolerance, niter, x, .sep)) + prt <- partition( + full_data, + threshold = threshold, + partitioner = partitioner, + tolerance = tolerance, + niter = niter, + x = x, + .sep = .sep + ) + return(prt) } # iteration counters @@ -182,7 +195,7 @@ super_partition <- function(full_data, width = 100 ) } - + # if no dimension reduction, use partition instead if (length(unique(master_cluster$cluster)) == ncol(full_data)) { if (verbose) message("No dimension reduction occured using Super Partition. Using Partition instead.") @@ -192,7 +205,7 @@ super_partition <- function(full_data, ## first cluster - always use largest cluster clust_sizes <- as.data.frame(table(master_cluster$cluster)) first_clust <- which(unique(master_cluster$cluster) == clust_sizes[which.max(clust_sizes$Freq), 1]) - + # get initial partition to build off part_master <- partition( full_data[, which(master_cluster$cluster == unique(master_cluster$cluster)[first_clust])], @@ -217,10 +230,10 @@ super_partition <- function(full_data, if (progress_bar) pb$tick() # for each cluster... - for (i in 1:n_iter) { + for (i in seq_len(n_iter)) { # skip if first cluster - if(i == first_clust) next() - + if (i == first_clust) next() + # what to do if cluster is of size one if (sum(master_cluster$cluster == unique(master_cluster$cluster)[i]) == 1) { # cbind data to master partition reduced data