-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathunsupervised_machine_learning_script.R
223 lines (177 loc) · 7.56 KB
/
unsupervised_machine_learning_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#defining data sets
terp <- read.csv("~/Desktop/f2_terp.csv")
data = read.csv("f2_terp.csv")[,2:8]
X = cor(data,use="p")
ag <- read.csv("/Users/admin/Desktop/ag_f2.csv")
data = read.csv("ag_f2.csv")[,2:9]
X = cor(data,use="p")
cannabinoid <- read.csv("~/Desktop/cannabinoid_f2.csv")
data = read.csv("cannabinoid_f2.csv")[,2:10]
X = cor(data,use="p")
alltraits <- read.csv("~/Desktop/alltraits_f2.csv")
data = read.csv("alltraits_f2.csv")[,2:25]
X = cor(data,use="p")
#Unsupervised Machine Learning Plots: PCA, Hierarchical Clusters, and Graphical Model
# PCA
pca = prcomp(X)
biplot(pca,col=c(0,1))
#Hierarchical clusters
Dst = dist(t(data))
Clust = hclust(Dst)
plot(Clust)
#Graphical model (lambda 0.45)
#install.packages("huge")
require(huge)
GGM = function(X,L) plot(graph.adjacency(as.matrix(huge(X,lambda=L,method="glasso",verbose=F)$path[[1]]),mode="undirected",diag=F),vertex.label = colnames(X),vertex.size=0)
GGM(X,0.45)
#pca of correlation of cavariance
#data = read.csv("Vg.csv")[,2:8]
X = cor(data,use="p")
#data = read.csv("Vg_cannabinoid.csv")[,2:10]
X = cor(data,use="p")
data = read.csv("Vg_ag.csv")[,2:9]
X = cor(data,use="p")
#Transpose original dataset and saving transposed dataset (only necessary if needed to have both datasets in equivalent orientation)
cannabinoidsf2 <- read.csv("~/Desktop/hd_markers_sr.csv")
hdmarker <- t(cannabinoidsf2)
write.csv(hdmarker, file = "hdmarkercannabinoids.csv")
#Reading in transposed dataset
cannabinoid_markers <- read.csv("~/Desktop/hdmarkercannabinoids.csv")
#Defining markers and saving file (should rename to marker data matrix as all phenotypic data from same mapping population)
terp_markers <- read.csv("hdmarker.csv", row.names=1, stringsAsFactors=FALSE)
terp_markers[terp_markers=="a"]=-1
terp_markers[terp_markers=="h"]=0
terp_markers[terp_markers=="b"]=1
terp_markers=data.matrix(terp_markers)
write.csv(terp_markers, file = "terp_markers2.csv")
cannabinoid_markers <- read.csv("hdmarkercannabinoids.csv", row.names=1, stringsAsFactors=FALSE)
cannabinoid_markers[cannabinoid_markers=="a"]=-1
cannabinoid_markers[cannabinoid_markers=="h"]=0
cannabinoid_markers[cannabinoid_markers=="b"]=1
cannabinoid_markers=data.matrix(cannabinoid_markers)
write.csv(cannabinoid_markers, file = "cannabinoid_markers.csv")
#Install NAM for analyses below
#install.packages("NAM")
library(NAM)
#IMP(X): Imputes missing points from matrix X with the average value of the column. This function does not return anything, rather it modifies X directly. Incase of missing data.
IMP(terp_markers)
#GAU(X): Created a Gaussian kernel from matrix X.
G = GAU(terp_markers)
G = GAU(cannabinoid_markers)
#Define phenotypic datasets
terp_pheno <- read.csv("f2_terp.csv", row.names=1, stringsAsFactors=FALSE)
cannabinoid_pheno <- read.csv("cannabinoid_f2.csv", row.names=1, stringsAsFactors=FALSE)
ag_pheno <- read.csv("ag_f2.csv", row.names=1, stringsAsFactors=FALSE)
alltraits_pheno <- read.csv("alltraits_f2.csv", row.names=1, stringsAsFactors=FALSE)
#Dataset check should equal one
mean(rownames(terp_markers) == rownames(terp_pheno))
mean(rownames(cannabinoid_markers) == rownames(cannabinoid_pheno))
mean(rownames(ag_markers) == rownames(ag_pheno))
mean(rownames(alltraits_markers) == rownames(alltraits_pheno))
#Define y with phenotypic trait data
y = terp_pheno$a.pinene_area
#y = terp_pheno$a_guaiene_area
#y = terp_pheno$s_guaiene_area
#y = terp_pheno$Caryophyllene_area
#y = terp_pheno$Humulene_area
#y = terp_pheno$Limonene_area
#y = terp_pheno$myrcene_area
#Define K as a Guassian Kernel using market dataset. Computes a list of orthogonal kernels containing additive, dominant and first-order epistatic effects.
library(NAM)
K = G2A_Kernels(terp_markers)
K = G2A_Kernels(cannabinoid_markers)
#Load the package that runs variance components, and reformat the Guassian Kernels from above
library(BGLR)
K2 = lapply(K, function(K) list(K=K,model="RKHS") )
#One trait at a time is currently broken
#One trait at a time (in this example, y is the phenotypes)
#FIT1 = BGLR(y, ETA = K2, verbose=FALSE)
#VC = unlist(c(lapply(FIT1$ETA, function(x) x$varU ),Ve=FIT2$varE))
#pie(VC)
#All traits at once (in this example, Y is the matrix of phenotypes)
Y = terp_pheno
Y = cannabinoid_pheno
Y = ag_pheno
Y = alltraits_pheno
##go back and add back in logthccbd removed it from all traits during code testing trying to fix!!!!
#Creating and saving Variance component analysis to estimate different genetic components as well as narrow and broad sense heritabilities
VC_ALL = apply(Y, 2, function(y){FIT = BGLR(y,ETA=K2,verbose=FALSE);return(unlist(c(lapply(FIT$ETA, function(x) x$varU ),Ve=FIT$varE)))})
VC_ALL
write.csv(VC_ALL, file = "VC_ALL.csv")
write.csv(VC_ALL, file = "VC_ALL_cannabinoid.csv")
write.csv(VC_ALL, file = "VC_ALL_ag.csv")
write.csv(VC_ALL, file = "VC_ALL_alltraits.csv")
#Scaling and saving Variance component dataset
VC_ALL_SCALE = apply(VC_ALL, 2, function(x) x/sum(x) )
VC_ALL_SCALE
write.csv(VC_ALL_SCALE, file = "VC_ALL_SCALE_cannabinoid.csv")
write.csv(VC_ALL_SCALE, file = "VC_ALL_SCALE_ag.csv")
#Plot a pie chart of heritability
pie( VC_ALL[, "a.guaiene_area"] )
pie( VC_ALL[, "s.guaiene_area"] )
pie( VC_ALL[, "Caryophyllene_area"] )
pie( VC_ALL[, "Limonene_area"] )
pie( VC_ALL[, "Humulene_area"] )
pie( VC_ALL[, "myrcene_area"] )
pie( VC_ALL[, "THCV"] )
pie( VC_ALL[, "THC"] )
pie( VC_ALL[, "CBD"] )
pie( VC_ALL[, "CBC"] )
pie( VC_ALL[, "CBG"] )
pie( VC_ALL[, "CBN"] )
pie( VC_ALL[, "Total.cannabinoids"] )
pie( VC_ALL[, "Log.THC.CBD"] )
pie( VC_ALL[, "THC.CBD"] )
#Broken unable to download on my version of R
#You can also try variance components with this package
#library(varComp)
#FIT_REML = varComp(y~1,varcov=K)
#Dataset checks; need a numeric dataset without missing data
is.numeric(terp_markers)
anyNA(terp_markers)
#Making and saving K is a list of kinships; K$A is the additive, it represents what is heritable and of interest for breeding
A = K$A
VC_ADD = apply(Y, 2, function(y) reml(y,K=A)$VC)
VC_ADD
write.csv(VC_ADD, file = "VC_ADD.csv")
write.csv(VC_ADD, file = "VC_ADD_cannabinoid.csv")
write.csv(VC_ADD, file = "VC_ADD_ag.csv")
#Dataset check data should be matrix
class(Y)
#Making and saving a correlation table
Y = data.matrix(Y)
GenCor = reml(Y,K=A)
GenCor
save(GenCor, file= "Multivariate_model_results.RData")
save(GenCor, file= "Multivariate_model_results_cannabinoid.RData")
save(GenCor, file= "Multivariate_model_results_ag.RData")
#Saving heritability files as csv format
#write.csv(GenCor$VC$Vg, "Additive_Genetic_Covariance.csv")
#write.csv(GenCor$VC$Ve, "Residual_Covariance.csv")
#write.csv(GenCor$VC$h2, "h2.csv")
write.csv(GenCor$VC$Vg, "Additive_Genetic_Covariance_cannabinoid.csv")
write.csv(GenCor$VC$Ve, "Residual_Covariance_cannabinoid.csv")
write.csv(GenCor$VC$h2, "h2_cannabinoid.csv")
write.csv(GenCor$VC$Vg, "Additive_Genetic_Covariance_ag.csv")
write.csv(GenCor$VC$Ve, "Residual_Covariance_ag.csv")
write.csv(GenCor$VC$h2, "h2_ag.csv")
Vg <- cov2cor(GenCor$VC$Vg)
Vg
write.csv(Vg, "Vg_cannabinoid.csv")
write.csv(Vg, "Vg_ag.csv")
Vg <- read.csv("Additive_Genetic_Covariance.csv", row.names=1, stringsAsFactors=FALSE)
#Dataset check should be zero
mean(is.na(Y))
#Making heat maps of correlation matrices
Vg_ag = read.csv("Vg_ag.csv", header=TRUE, row.names=1)
Vg_ag_matrix = as.matrix(Vg_ag)
heatmap(Vg_ag_matrix)
Vg_cannabinoid = read.csv("Vg_cannabinoid.csv", header=TRUE, row.names=1)
Vg_cannabinoid_matrix = as.matrix(Vg_cannabinoid)
heatmap(Vg_cannabinoid_matrix)
Vg_terpene = read.csv("Vg.csv", header=TRUE, row.names=1)
Vg_terpene_matrix = as.matrix(Vg_terpene)
heatmap(Vg_terpene_matrix)
#Dataset check class should be matrix
class(Vg_ag_matrix)
Vg_ag