-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path11_individual_serums_normalize_data.R
190 lines (151 loc) · 8.43 KB
/
11_individual_serums_normalize_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#### PREPARATION STEPS ####
## You can run this code as it is to process a small subset of proteins, or you can follow these next steps to analyze the entire dataset.
## 1. In chagastope_data/inputs/11_individual_serums_array_design place the "Supplementary File S09 - Mapping of CHAGASTOPE-v2 data to T cruzi proteins.tsv" file (links in the paper)
## 2. In chagastope_data/inputs/12_individual_serums_raw_data place each of the raw data files from CHAGASTOPE-v2 (such as AR_P1_PO_raw.tsv) downloaded from Array Express (https://www.ebi.ac.uk/arrayexpress/experiments/E-MTAB-11655/)
## 3. Set the "testing" variable in the config below to FALSE, or run this code with the "-test F" argument
## 4. If you are running this code in Rstudio, set the "main_folder" variable in the config below to the folder containing this code
#### WARNINGS ####
## This code uses large amounts of RAM
#### CONFIG ####
main_folder <- "." #When running in Rstudio, set this to the absolute path of the folder containing this code
testing <- TRUE #set this to FALSE when running the actual data
#### READ ARGUMENTS AND GET PATH (DO NOT CHANGE) ####
args <- commandArgs(TRUE)
if (length(args == 2)) {
if (args[1] == "-test") {
testing <- as.logical(args[2])
}
}
if (testing == TRUE) {
#For testing
project_folder <- sprintf("%s/test_data", main_folder)
} else {
#For running the actual data
project_folder <- sprintf("%s/chagastope_data", main_folder)
}
#### INTERNAL CONFIG (DO NOT CHANGE) ####
library(data.table)
library(preprocessCore) #quantile.normalization
design_data_file <- sprintf("%s/inputs/11_individual_serums_array_design/Supplementary File S09 - Mapping of CHAGASTOPE-v2 data to T cruzi proteins.tsv", project_folder)
raw_data_folder <- sprintf("%s/inputs/12_individual_serums_raw_data", project_folder)
sources <- c("AR_P1", "AR_P2", "AR_P3", "AR_P4", "AR_P5", "AR_P6", "BO_P1", "BO_P2", "BO_P3", "BO_P4", "BO_P5", "BO_P6", "BR_P1", "BR_P2", "BR_P3", "BR_P4", "BR_P5", "CO_P1", "CO_P2",
"CO_P3", "CO_P4", "MX_P1", "MX_P2", "MX_P3", "MX_P4", "MX_P5", "MX_P6", "US_P1", "US_P2", "US_P3", "US_P4", "US_P5", "US_P6", "AR_E1", "AR_E2", "AR_E3", "AR_E4", "AR_E5",
"AR_E6", "BO_E1", "BO_E2", "BO_E3", "BO_E4", "BO_E5", "BO_E6", "BR_E1", "BR_E2", "BR_E3", "BR_E4", "BR_E5", "BR_E6", "BR_E7", "CO_E1", "CO_E2", "CO_E3", "CO_E4", "CO_E5",
"CO_E6", "CO_E7", "MX_E1", "MX_E2", "MX_E3", "MX_E4", "MX_E5", "MX_E6", "US_E1", "US_E2", "US_E3", "US_E4", "US_E5", "US_E6")
types <- c("PO")
output_signal_decimals <- 2
output_statistics_mode_decimals <- 2
output_statistics_sd_decimals <- 2
output_folder <- sprintf("%s/outputs/11_individual_serums_normalized_data", project_folder)
output_suffix <- "_processed.tsv"
#### AUXILIAR FUNCTIONS ####
calculateMode <- function(x, decimals = 0) {
x <- round(x, decimals)
ux <- unique(x)
ux[which.max(tabulate(match(x, ux)))]
}
#### NORMALIZE ALL DATA ####
#Get data
output_initialized <- F
for (source_for in sources) {
# source_for <- sources[1]
for (type_for in types) {
# type_for <- types[1]
raw_data_file <- sprintf("%s/%s_%s_raw.tsv", raw_data_folder, source_for, type_for)
raw_data <- fread(raw_data_file, header = T, sep = "\t", na.strings = NULL)
setnames(raw_data, "allData.replica1", sprintf("%s_%s_rawData_allData_r1", source_for, type_for))
setnames(raw_data, "allData.replica2", sprintf("%s_%s_rawData_allData_r2", source_for, type_for))
#Check if I need to initialize the tables to normalize
if (output_initialized == F) {
full_raw_PO_data <- raw_data[, .(`Reporter Name`)]
output_initialized <- T
}
#Add the data to the corresponding table
full_raw_PO_data <- merge(full_raw_PO_data,
raw_data,
by = "Reporter Name")
}
}
#Normalize the positive data
matrix_PO_aux <- normalize.quantiles(as.matrix(full_raw_PO_data[, -c("Reporter Name")]))
matrix_PO_aux <- round(matrix_PO_aux, digits = output_signal_decimals)
normalized_PO_data <- as.data.table(matrix_PO_aux)
colnames(normalized_PO_data) <- colnames(full_raw_PO_data[, -c("Reporter Name")])
normalized_PO_data$`Reporter Name` <- full_raw_PO_data$`Reporter Name`
rm(full_raw_PO_data)
rm(matrix_PO_aux)
gc()
#Combine data
colnames(normalized_PO_data) <- gsub("rawData", "normalizedData", colnames(normalized_PO_data))
full_normalized_data <- normalized_PO_data
rm(normalized_PO_data)
gc()
#### NORMALIZE ONLY REGION DATA ####
design_data <- fread(design_data_file, header = T, sep = "\t", na.strings = NULL)
ids_to_keep <- unique(design_data[group == "antigenic_regions_Tcruzi"]$array_express_id)
#Get data
output_initialized <- F
for (source_for in sources) {
# source_for <- sources[1]
for (type_for in types) {
# type_for <- types[1]
raw_data_file <- sprintf("%s/%s_%s_raw.tsv", raw_data_folder, source_for, type_for)
raw_data <- fread(raw_data_file, header = T, sep = "\t", na.strings = NULL)
raw_data <- raw_data[`Reporter Name` %in% ids_to_keep]
setnames(raw_data, "allData.replica1", sprintf("%s_%s_rawData_onlyRegionsData_r1", source_for, type_for))
setnames(raw_data, "allData.replica2", sprintf("%s_%s_rawData_onlyRegionsData_r2", source_for, type_for))
#Check if I need to initialize the tables to normalize
if (output_initialized == F) {
full_raw_PO_data <- raw_data[, .(`Reporter Name`)]
output_initialized <- T
}
#Add the data to the corresponding table
full_raw_PO_data <- merge(full_raw_PO_data,
raw_data,
by = "Reporter Name")
}
}
#Normalize the positive data
matrix_PO_aux <- normalize.quantiles(as.matrix(full_raw_PO_data[, -c("Reporter Name")]))
matrix_PO_aux <- round(matrix_PO_aux, digits = output_signal_decimals)
normalized_PO_data <- as.data.table(matrix_PO_aux)
colnames(normalized_PO_data) <- colnames(full_raw_PO_data[, -c("Reporter Name")])
normalized_PO_data$`Reporter Name` <- full_raw_PO_data$`Reporter Name`
rm(full_raw_PO_data)
rm(matrix_PO_aux)
gc()
#Combine data
colnames(normalized_PO_data) <- gsub("rawData", "normalizedData", colnames(normalized_PO_data))
full_normalized_data <- merge(full_normalized_data,
normalized_PO_data,
by = "Reporter Name",
all.x = T)
#### CALCULATE GLOBAL STATISTICS (FROM THE ONLY REGION NORMALIZATION) ####
cols_to_select <- setdiff(colnames(normalized_PO_data), "Reporter Name")
global_signals <- as.vector(as.matrix(normalized_PO_data[, cols_to_select, with = F]))
mode_aux <- calculateMode(global_signals, decimals = output_statistics_mode_decimals)
sd_aux <- round(sd(global_signals), output_statistics_sd_decimals)
global_statistics <- data.table(mode = mode_aux,
sd = sd_aux)
rm(normalized_PO_data)
rm(global_signals)
gc()
#### SAVE DATA ####
global_statistics_output_file <- sprintf("%s/global_statistics.tsv", output_folder)
write.table(global_statistics, file = global_statistics_output_file, col.names = T, row.names = F, sep = "\t", quote = T)
for (source_for in sources) {
# source_for <- sources[1]
for (type_for in types) {
# type_for <- types[1]
cols_to_select <- colnames(full_normalized_data)
cols_to_select <- cols_to_select[grepl(sprintf("%s_%s", source_for, type_for), cols_to_select)]
cols_to_select <- c("Reporter Name", cols_to_select)
normalized_data_for <- full_normalized_data[, cols_to_select, with = F]
setnames(normalized_data_for, sprintf("%s_%s_normalizedData_allData_r1", source_for, type_for), "allData.replica1")
setnames(normalized_data_for, sprintf("%s_%s_normalizedData_allData_r2", source_for, type_for), "allData.replica2")
setnames(normalized_data_for, sprintf("%s_%s_normalizedData_onlyRegionsData_r1", source_for, type_for), "onlyRegionsData.replica1")
setnames(normalized_data_for, sprintf("%s_%s_normalizedData_onlyRegionsData_r2", source_for, type_for), "onlyRegionsData.replica2")
normalized_data_output_file <- sprintf("%s/%s_%s%s", output_folder, source_for, type_for, output_suffix)
write.table(normalized_data_for, file = normalized_data_output_file, col.names = T, row.names = F, sep = "\t", quote = T, na = "")
}
}