mGWAS/amm_gwas_vesto_v3.R at main · mabro7766/mGWAS · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
##############################################################################################################################################
###   AMM   -- R script for GWAS corecting for population structure (similar to EMMAX and P3D)
###
#######
#
##
##
##
#
#source('emma.r')

##REQUIRED DATA & FORMAT
#
#PHENOTYPE - Y: a n by k+1 matrix, where n=number of individuals
#               k number of phenotypes
#               1st column contains the ecotype ids

#GENOTYPE - X: a n by m matrix, where n=number of individuals, m=number of SNPs, with rownames(X)=individual names, and colnames(X)=SNP names
#KINSHIP - K: a n by n matrix, with rownames(K)=colnames(K)=individual names, you can calculate K as IBS matrix using the emma package K<-emma.kinship(t(X))
#each of these data being sorted in the same way, according to the individual name
#
#
#SNP INFORMATION - SNP_INFO: a data frame having at least 3 columns:
# - 1 named 'SNP', with SNP names (same as colnames(X)),
# - 1 named 'Chr', with the chromosome number to which belong each SNP
# - 1 named 'Pos', with the position of the SNP onto the chromosome it belongs to.
#######

amm_gwas_vesto_v3<-function(Y,X,K,p=0.001,n=2,run=T,calculate.effect.size=FALSE,include.lm=FALSE,use.SNP_INFO=FALSE,report=T)
  {

  stopifnot(is.numeric(Y[,1]))
  Y_<-Y[order(Y[,1]),]
  Y<-as.matrix(Y_[,n])
  rownames(Y)<-Y_[,1]
  colnames(Y)<-colnames(Y_)[n]
  Y<-na.omit(Y)
  XX<-X[rownames(X) %in% rownames(Y),]

  if (report==T) {cat ('GWAS performed on', length(which(rownames(Y)%in%rownames(XX))),'ecotypes, ', nrow(Y)-length(which(rownames(Y)%in%rownames(XX))),'values excluded','\n')}


  if (use.SNP_INFO==FALSE){
  options(stringsAsFactors = FALSE)

  if (report==T) {cat('SNP_INFO file created','\n')}
  SNP_INFO <<-data.frame(cbind(colnames(X),matrix(nrow=ncol(X),ncol=2,data=unlist(strsplit(colnames(X),split='_')),byrow=T)))
  colnames(SNP_INFO) <<-c('SNP','Chr','Pos')
  #SNP_INFO[,2] <<-as.numeric(levels(SNP_INFO[,2]))[SNP_INFO[,2]]
  #SNP_INFO[,3] <<-as.numeric(levels(SNP_INFO[,3]))[SNP_INFO[,3]]
  SNP_INFO[,2]<<-as.integer(paste(SNP_INFO[,2]))
  SNP_INFO[,3]<<-as.integer(paste(SNP_INFO[,3]))
  } else { if (report==T) {cat('User definied SNP_INFO file is used','\n')}}


  Y1<-as.matrix(Y[rownames(Y) %in% rownames(XX),])
  colnames(Y1)<-colnames(Y)
  rownames(Y1)<-rownames(Y)[rownames(Y)%in%rownames(XX)]
  ecot_id<-as.integer(rownames(Y1))


  K1<-K[rownames(K) %in% ecot_id,]

  K2<-K1[,colnames(K1) %in% ecot_id]

  K_ok<-as.matrix(K2)

# The formula was developed by Bjarni and
# goes back to the description in the original EMMAX paper (Kang et al. 2008, Genetics)
# The reason to scale the K-matrix is to be able to interpret the effect size.
# it does not change the p-values
# upon comparing with K and 2K, I see only slight changes starting from the 7th decimal  in the SNP effects (beta's)
# I do see a change in herit=vg/(vg+ve)
# best to run a full model on the most important SNPs (done by GWAPP)

  a<-rownames(K_ok)

#  n<-length(a)
#  K_stand<-(n-1)/sum((diag(n)-matrix(1,n,n)/n)*K_ok)*K_ok

  K_stand = 2*K_ok

  Y<-Y1[which(rownames(Y1)%in%a),]
  X_<-XX[which(rownames(XX)%in%a),]

  rm(X,XX)
  gc()


  AC_1 <- data.frame(colnames(X_),apply(X_,2,sum))
  colnames(AC_1)<-c('SNP','AC_1')

  MAF_1<-data.frame(AC_1,AC_0=nrow(X_)-AC_1$AC_1)

  MAF_2<-data.frame(MAF_1,MAC=apply(MAF_1[,2:3],1,min))

  MAF_3<-data.frame(MAF_2,MAF=(MAF_2$MAC/nrow(X_)))

  MAF_ok<-merge(SNP_INFO,MAF_3,by='SNP')

  rm(AC_1,MAF_1,MAF_2,MAF_3)

  #Filter for MAF

  MAF<-subset(MAF_ok,MAF<p)[,1]

  X_ok<<-X_[,!colnames(X_) %in% MAF]

  rm(MAF,X_)

  #REML

  Xo<-rep(1,nrow(X_ok))
  ex<-as.matrix(Xo)


  null<-emma.REMLE(Y,ex,K_stand)

  herit<<-null$vg/(null$vg+null$ve)
  if (report==T) {cat('pseudo-heritability estimate is ',herit,'\n')}

  if (run==FALSE) {
  h<<-herit
  cat('no GWAS performed','\n') } else {

      M<-solve(chol(null$vg*K_stand+null$ve*diag(dim(K_stand)[1])))


    Y_t<-crossprod(M,Y)

    int_t<-crossprod(M,(rep(1,length(Y))))

    if (calculate.effect.size==T)
       {
       models1<-apply(X_ok,2,function(x){summary(lm(Y_t~0+int_t+crossprod(M,x)))$coeff[2,]})
       out_models<<-data.frame(SNP=colnames(models1),Pval=models1[4,],beta=models1[1,])
       } else {
        #EMMAX SCAN
        RSS_env<-rep(sum(lsfit(int_t,Y_t,intercept = FALSE)$residuals^2),ncol(X_ok))
        R1_full<-apply(X_ok,2,function(x){sum(lsfit(cbind(int_t,crossprod(M,x)),Y_t,intercept = FALSE)$residuals^2)})
        m<-nrow(Y1)

        F_1<-((RSS_env-R1_full)/1)/(R1_full/(m-3))
        pval_Y1<-pf(F_1,1,(m-3),lower.tail=FALSE)

        snp<-colnames(X_ok)
        out_models<-data.frame(SNP=snp,Pval=pval_Y1)
        }

  output<<-merge(MAF_ok,out_models,by='SNP')
  ## sort by Chr and Pos
  output.sort <<- output[with(output, order(Chr,Pos)),]
}


}