-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMPH_extract.R
More file actions
59 lines (44 loc) · 2.09 KB
/
MPH_extract.R
File metadata and controls
59 lines (44 loc) · 2.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
args <- commandArgs(trailingOnly = TRUE)
region <- as.character(args[1])
input_start_hxb2 <- as.numeric(args[2])
input_stop_hxb2 <- as.numeric(args[3])
project_folder <- as.character(args[4])
macaques_file <- as.character(args[5])
alignments_folder <- as.character(args[6])
library(seqinr)
macaques <- scan(macaques_file, what = "", sep = "\n")
alignment_files <- paste0(macaques, "_hxb2.fasta")
for (file in alignment_files) {
#get alignment file
file_location <- paste0(alignments_folder, file)
alignment <- read.alignment(file = file_location, format = "fasta", forceToLower = FALSE)
#get hxb2 reference
hxb2_index <- which(sapply(alignment$nam, function(x) grepl("HXB2", x)))
if (length(hxb2_index) == 0) {stop(paste0("HXB2 reference not found in the alignment file: ", file_location, " !!!"))}
hxb2_seq <- alignment$seq[[hxb2_index]]
#define region of interest (hxb2 numbering)
start_hxb2 <- input_start_hxb2
stop_hxb2 <- input_stop_hxb2
#convert hxb2 numbering to alignment numbering
cumulative_non_gaps <- numeric(nchar(hxb2_seq))
total_non_gaps <- 0
for (i in 1:nchar(hxb2_seq)) {
if (substr(hxb2_seq, i, i) != "-") {
total_non_gaps <- total_non_gaps + 1
}
cumulative_non_gaps[i] <- total_non_gaps
}
if(total_non_gaps != 709) {print(paste("Warning: wrong length for HXB2 in", file))} #Adjust for legnth of reference sequence used
for (i in 1:length(cumulative_non_gaps)) {
if(start_hxb2 == cumulative_non_gaps[i]) {start <- as.integer(i)}
if(stop_hxb2 == cumulative_non_gaps[i]) {stop <- as.integer(i)}
}
#extract regions of interest
trimmed_regions <- sapply(alignment$seq, function(seq) paste(substr(seq, start, stop), collapse=""))
#save new alignment
output_file <- paste0(sub("_.*", "", file), "_", region, ".fasta")
output_folder <- paste0(project_folder, region, "/")
if(!dir.exists(project_folder)) {dir.create(project_folder)}
if(!dir.exists(output_folder)) {dir.create(output_folder)}
write.fasta(sequences = as.list(trimmed_regions), names = alignment$nam, file.out = paste0(output_folder, output_file))
}