Changed get_cluster_fill_counts() to use tidyselect

stephaniereinders · stephaniereinders · commit 6ac9d7978304 · 2024-11-26T13:28:02.000-06:00
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -33,6 +33,7 @@ Imports:
     rjags,
     stringr,
     tidyr,
+    tidyselect
 Suggests: 
     knitr,
     rmarkdown,
diff --git a/R/cluster_format.R b/R/cluster_format.R
@@ -65,51 +65,40 @@ format_template_data <- function(template) {
 }
 
 
-#' get_cluster_fill_counts
+#' Get Cluster Fill Counts
 #'
 #' `get_cluster_fill_counts()` creates a data frame that shows the number of
 #' graphs in each cluster for each input document.
 #'
-#' @param df A data frame with columns `writer`, `doc`, and `cluster`. Each
-#'   row corresponding to a graph and lists the writer of that graph, the document
-#'   from which the graph was obtained, and the cluster to which that graph is assigned.
-#' @return A dataframe of cluster fill counts for each document in the input data frame.
-#' 
+#' @param df A data frame with columns `docname` and `cluster`. Each row
+#'   corresponds to a graph and lists the document from which the graph was
+#'   obtained and the cluster to which that graph is assigned. Optionally, the
+#'   data frame might also have `writer` and `doc` columns. If present, `writer`
+#'   lists the writer ID of each document and `doc` is an identifier to
+#'   distinguish between different documents from the same writer.
+#' @return A data frame of cluster fill counts for each document in the input
+#'   data frame.
+#'
 #' @examples
-#' writer <- c(rep(1, 20), rep(2, 20), rep(3, 20))
 #' docname <- c(rep('doc1',20), rep('doc2', 20), rep('doc3', 20))
+#' writer <- c(rep(1, 20), rep(2, 20), rep(3, 20))
 #' doc <- c(rep(1, 20), rep(2, 20), rep(3, 20))
 #' cluster <- sample(3, 60, replace=TRUE)
 #' df <- data.frame(docname, writer, doc, cluster)
 #' get_cluster_fill_counts(df)
-#' 
+#'
 #' @export
 #' @md
 get_cluster_fill_counts <- function(df) {
   docname <- writer <- doc <- cluster <- n <- NULL
   
-  if (('writer' %in% colnames(df)) && ('doc' %in% colnames(df))) {
-    # count number of graphs in each cluster for each writer
-    cluster_fill_counts <- df %>%
-      dplyr::group_by(docname, writer, doc, cluster) %>%
-      dplyr::summarise(n = dplyr::n()) %>%
-      dplyr::mutate(n = as.integer(n)) %>%
-      tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0)
-    
-    # sort columns
-    cols <- c(colnames(cluster_fill_counts[, c(1, 2, 3)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1, 2, 3)]))))
-    cluster_fill_counts <- cluster_fill_counts[, cols]
-  } else {
-    cluster_fill_counts <- df %>%
-      dplyr::group_by(docname, cluster) %>%
+  # count number of graphs in each cluster for each document
+  cluster_fill_counts <- df %>%
+      dplyr::group_by(dplyr::pick(tidyselect::any_of(c("docname", "writer", "doc", "cluster")))) %>%
       dplyr::summarise(n = dplyr::n()) %>%
       dplyr::mutate(n = as.integer(n)) %>%
-      tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0)
-    
-    # sort columns
-    cols <- c(colnames(cluster_fill_counts[, c(1)]), sort(as.numeric(colnames(cluster_fill_counts[, -c(1)]))))
-    cluster_fill_counts <- cluster_fill_counts[, cols]
-  }
+      tidyr::pivot_wider(names_from = cluster, values_from = n, values_fill = 0) %>%
+      dplyr::select(tidyselect::any_of(c("docname", "writer", "doc")), tidyselect::everything())
 
   return(cluster_fill_counts)
 }
diff --git a/man/get_cluster_fill_counts.Rd b/man/get_cluster_fill_counts.Rd