Here's a good
post on
hierarchical clustering. Using data from the datamicroarray
package.
.libPaths('/packages')
my_packages <- c('dendextend', 'remotes', 'datamicroarray')
for (my_package in my_packages){
if(!require(my_package, character.only = TRUE)){
if (my_package == 'datamicroarray'){
install_github('ramhiser/datamicroarray')
} else {
install.packages(my_package, '/packages')
}
library(my_package, character.only = TRUE)
}
}
I will use the yeoh
data set.
data('yeoh', package = "datamicroarray")
dim(yeoh$x)
## [1] 248 12625
table(yeoh$y)
##
## BCR E2A Hyperdip MLL T TEL
## 15 27 64 20 43 79
Calculate distance between all samples.
choose(248, 2)
## [1] 30628
my_dist <- dist(yeoh$x)
summary(my_dist)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 10.65 19.32 21.93 22.40 24.87 48.88
Perform hierarchical clustering using complete (maximum) linkage, which is the default.
my_hclust <- hclust(my_dist)
Form six clusters based on the clustering.
my_clus <- cutree(my_hclust, k = 6)
table(my_clus, yeoh$y)
##
## my_clus BCR E2A Hyperdip MLL T TEL
## 1 13 23 32 3 24 35
## 2 2 0 4 1 0 18
## 3 0 4 9 5 5 10
## 4 0 0 18 10 0 14
## 5 0 0 1 1 3 2
## 6 0 0 0 0 11 0
cluster_one <- yeoh$y[my_clus == 1]
Form n
clusters based on arbitrary distance.
my_clus_two <- cutree(my_hclust, h = 25)
# much more homogeneous
table(my_clus_two, yeoh$y)
##
## my_clus_two BCR E2A Hyperdip MLL T TEL
## 1 12 0 10 0 0 2
## 2 1 0 22 0 0 33
## 3 2 0 3 0 0 17
## 4 0 23 0 3 0 0
## 5 0 4 8 5 1 10
## 6 0 0 16 0 0 0
## 7 0 0 1 0 0 0
## 8 0 0 1 0 0 0
## 9 0 0 1 10 0 0
## 10 0 0 1 1 0 1
## 11 0 0 1 0 0 0
## 12 0 0 0 1 3 2
## 13 0 0 0 0 6 0
## 14 0 0 0 0 18 0
## 15 0 0 0 0 8 0
## 16 0 0 0 0 3 0
## 17 0 0 0 0 4 0
## 18 0 0 0 0 0 14
Plot.
my_hclust_mod <- my_hclust
my_hclust_mod$labels <- as.vector(yeoh$y)
plot(color_branches(my_hclust_mod, h = 25, groupLabels = TRUE))
Time built.
## [1] "2022-10-20 06:51:24 UTC"
Session info.
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/liblapack.so.3
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] datamicroarray_0.2.3 remotes_2.4.2 dendextend_1.16.0
## [4] forcats_0.5.1 stringr_1.4.0 dplyr_1.0.9
## [7] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0
## [10] tibble_3.1.7 ggplot2_3.3.6 tidyverse_1.3.1
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.2 xfun_0.31 haven_2.5.0 colorspace_2.0-3
## [5] vctrs_0.4.1 generics_0.1.3 viridisLite_0.4.0 htmltools_0.5.2
## [9] yaml_2.3.5 utf8_1.2.2 rlang_1.0.3 pillar_1.7.0
## [13] glue_1.6.2 withr_2.5.0 DBI_1.1.3 dbplyr_2.2.1
## [17] modelr_0.1.8 readxl_1.4.0 lifecycle_1.0.1 munsell_0.5.0
## [21] gtable_0.3.0 cellranger_1.1.0 rvest_1.0.2 evaluate_0.15
## [25] knitr_1.39 tzdb_0.3.0 fastmap_1.1.0 fansi_1.0.3
## [29] highr_0.9 broom_1.0.0 scales_1.2.0 backports_1.4.1
## [33] jsonlite_1.8.0 fs_1.5.2 gridExtra_2.3 hms_1.1.1
## [37] digest_0.6.29 stringi_1.7.6 grid_4.2.1 cli_3.3.0
## [41] tools_4.2.1 magrittr_2.0.3 crayon_1.5.1 pkgconfig_2.0.3
## [45] ellipsis_0.3.2 xml2_1.3.3 reprex_2.0.1 lubridate_1.8.0
## [49] viridis_0.6.2 rstudioapi_0.13 assertthat_0.2.1 rmarkdown_2.14
## [53] httr_1.4.3 R6_2.5.1 compiler_4.2.1