SCION/ica_clustering.R at master · nmclark2/SCION · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#Original author Mitch Elmore
#Modified by Natalie Clark for SCION

#### ICAclust ####
#https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0181195

#smaller values of k = more and tighter clusters

#####################################################################################
ica_clustering<-function(clustering_data, k) {

  normmatrix <- t(scale(t(clustering_data), scale=TRUE, center=TRUE)) #center and scale expression matrix

  #for ICA, smaller k = tighter clusters, so we need to convert the threshold
  #k = 1.1-k

  #set.seed(2020)
  X.ICA<-fastICA(normmatrix,n.comp=ncol(normmatrix), alg.typ = "parallel", fun = "logcosh", alpha = 1.0,
                 method = "C", row.norm = FALSE, maxit = 5000, tol = 1e-03, verbose = TRUE)
  hc.ICA<-hclust(dist(X.ICA$S), method = "ward.D", members=NULL)
  mojena=mean(hc.ICA$height)+k*sd(hc.ICA$height)
  cluster_num=length(hc.ICA$height[hc.ICA$height>mojena]) + 1
  clusters<-cutree(hc.ICA, k=cluster_num)
  results <- cbind(normmatrix,clusters)

  #convert matrix to dataframe
  results <- as.data.frame(results)

  #prepare data for plotting
  # plotdata <- as.data.frame(t(results[,1:dim(normmatrix)[2]]))
  # stacked <- stack(plotdata)
  # stacked[,3] =  rep(colnames(results)[1:dim(normmatrix)[2]],dim(plotdata)[2])
  # stacked[,4] = rep(clusters,each=dim(normmatrix)[2])
  # colnames(stacked) <- c('Norm.Intensity','gene','group','cluster')
  # #add means of each cluster as reference lines
  # reflines = by(results[,1:dim(normmatrix)[2]], results$clusters, colMeans)
  # reflinedata <- do.call(cbind, reflines)
  # reflinedata <- as.data.frame(reflinedata)
  # reflinestacked <- stack(reflinedata)
  # reflinestacked[,2] <- rep(colnames(results)[1:dim(normmatrix)[2]],max(clusters))
  # reflinestacked[,3] <- rep(1:max(clusters),each=dim(normmatrix)[2])
  # colnames(reflinestacked) <- c('Norm.Intensity','group','cluster')
  # #plot one cluster per file
  # for (i in 1:max(results$clusters)){
  #   g <- ggplot(data = stacked[stacked$cluster==i,],
  #               mapping = aes(x=group, y=Norm.Intensity, colour = as.factor(cluster), group=gene)) +
  #     geom_line() + theme(legend.position="none")+
  #     scale_x_discrete(limits = unique(stacked$group), labels=colnames(normmatrix))
  #   g <- g + geom_line(data = reflinestacked[reflinestacked$cluster==i,],
  #                      aes(x = group, y = Norm.Intensity, group = cluster), colour='BLACK')
  #   ggplotly(g)
  #   ggsave(paste('Cluster Plots/cluster', i, '.png', sep=""),device="png",width=5, height=3,dpi=600)
  # }
  #finally, save the final data table
  write.csv(results,'clusters.csv')
  return(results)
}