-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathThe relationship between gene expression and GC content.R
More file actions
78 lines (52 loc) · 2.52 KB
/
The relationship between gene expression and GC content.R
File metadata and controls
78 lines (52 loc) · 2.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#### The relationship between gene expression and GC content
TPM <- read.table("/mnt/data5/BGI/UCB/ExpMat_NewID/TPM_mat.txt", header = T, row.names = 1, stringsAsFactors = F)
CellSum <- apply(TPM,1,function(x){sum(x>0)})
GEmean <- apply(TPM,1,function(x){mean(x[x>0])})
sum(CellSum==0)
[1] 10893
GEmean[is.na(GEmean)] <- 0
sum(CellSum>0 & GEmean>0)
[1] 47158
TPM <- TPM[CellSum>0 & GEmean>0,]
CellSum <- apply(TPM,1,function(x){sum(x>0)})
GEmean <- apply(TPM,1,function(x){mean(x[x>0])})
levels <- factor(cut(CellSum,unique(quantile(CellSum, probs=seq(0,1,.01))),include.lowest=T),labels=1:78)
geneLevel <- data.frame(gene = names(CellSum), level = as.character(levels), stringsAsFactors = F)
gtf <- read.table("/mnt/data5/BGI/UCB/tangchao/Homo_sapiens.GRCh38.87.gtf", header = FALSE, sep = "\t", stringsAsFactors = F)
gtf <- gtf[gtf$V3=="gene",]
test <- as.data.frame(do.call(rbind, strsplit(as.character(gtf$V9), split = "[; ]")))
valueFind <- function(x, type = "gene_id"){
for(i in 1:length(x)){
if(sum(x == type) == 0){
return(NA)
}else{
return(x[which(x == type)[1]+1])
}
}
}
gtf_infor <- data.frame(gene_id = as.vector(apply(test,1,valueFind)),
gene_name = as.vector(apply(test,1,function(x){valueFind(x,type = "gene_name")})),
gene_biotype = as.vector(apply(test,1,function(x){valueFind(x,type = "gene_biotype")})), stringsAsFactors=F)
gene_gtf <- cbind(gtf[,1:8],gtf_infor)
GLI <- merge(x = gene_gtf, y = geneLevel, by.x = "gene_id", by.y = "gene", all.y = T)
GLI <- GLI[GLI$V1 %in% c(1:22,"X","Y"),]
library(GenomicRanges)
GR <- GRanges(seqnames = paste("chr", GLI$V1,sep = ""),IRanges(start = GLI$V4, end = GLI$V5), strand = GLI$V7)
library(BSgenome)
available.genomes(splitNameParts=FALSE, type=getOption("pkgType"))
library(BSgenome.Hsapiens.UCSC.hg38)
GR_dna <- getSeq(Hsapiens, GR)
MY_GC <- letterFrequency(GR_dna, "GC", as.prob=TRUE)
GLI$GC <- as.numeric(MY_GC)
GLI$level <- factor(GLI$level, levels = 1:78)
aggregate(GLI$GC,by=list(GLI$level),mean)
aggregate(GLI$GC,by=list(GLI$gene_biotype),mean)
aggregate(GLI$GC,by=list(GLI$level),boxplot)
library(ggplot2)
ggplot(GLI, aes(x=level, y=GC)) +
geom_boxplot(outlier.shape=NA)
biomart_GC <- read.table("/mnt/data5/BGI/UCB/tangchao/GEGC/Gene_GC_content.txt", sep = "\t", header = T)
colnames(biomart_GC) <- c("gene","GC_biomart")
GLI <- merge(x = GLI, y = biomart_GC, by.x = "gene_id", by.y = "gene", all.x = T)
aggregate(GLI$GC,by=list(GLI$level),mean)
aggregate(GLI$GC_biomart,by=list(GLI$level),function(x){mean(x, na.rm = T)})