
Commit 4b085a3

add SRA sequence download and processing pipeline
1 parent: 9ff535c

File tree

10 files changed: +472 −336 lines


.gitignore

Lines changed: 13 additions & 1 deletion
@@ -1,8 +1,20 @@
+
+# large data
 gen_data/raw_pred_extrapolation_wu.csv
+fastq_files/*fastq
+merged_reads
+
+# ignore any temporary fasterq directories just in case
+fastq_files/fasterq*
+
+# miscellaneous and extraneous files
 designs_counts_original.csv
 reformat_df.py
-#pretrained_models/*
 data/olson_pred_fitnesses.csv
+
 .DS_Store
 .ipynb_checkpoints/
 __pycache__/
+
+# ignore any temporary directories
+tmp
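A quick way to confirm the new ignore rules behave as intended is `git check-ignore`, sketched here in the repo's own Python; the paths below are hypothetical stand-ins for files the pipeline generates:

```python
import subprocess

# Hypothetical paths matching the new .gitignore patterns; -v prints
# which rule (file:line:pattern) matched each ignored path.
paths = ['fastq_files/sample_R1.fastq', 'fastq_files/fasterq.tmp.01', 'tmp']
result = subprocess.run(['git', 'check-ignore', '-v', *paths],
                        capture_output=True, text=True)
print(result.stdout)
```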

03_design_experimental_analysis.ipynb

Lines changed: 98 additions & 71 deletions
Large diffs are not rendered by default.

03_preprocessing.py

Lines changed: 53 additions & 31 deletions
@@ -1,9 +1,11 @@
 import math
 import numpy as np
+from tqdm import tqdm
 from os.path import join
 from os import listdir
 import pandas as pd
 import sys
+import subprocess
 
 def get_read_data(all_data):
     '''
@@ -67,68 +69,88 @@ def process_read_pair(fwd_all_data, rev_all_data):
 input_seq_df: filed that should contain a column called 'dna_seq' which has the nucleotide seq we want to match
 output_counts_file: where to output the counts for each NGS file
 '''
-
 _, input_directory, date, merged_reads_directory, input_seq_df, output_counts_file = sys.argv
 seq_df = pd.read_csv(input_seq_df)
 
-file_names = listdir(input_directory)
-file_name_pairs = []
-for file_name in file_names:
-    if '.fastq' in file_name and 'R1' in file_name:
-        file_name_match = 'R2'.join(file_name.split('R1'))
-        if file_name in file_names:
-            file_name_pairs.append([file_name, file_name_match])
-            print('Found pair of file reads {}, {}'.format(file_name, file_name_match))
-        else:
-            print('Unable to find match for {}'.format(file_name))
-
-merged_reads_output_names = ['both_reads'.join(file_name_pair[0].split('R1')) for file_name_pair in file_name_pairs]
-read_descriptors = [file_name_pair[0].split('R1')[0] + date for file_name_pair in file_name_pairs]
-
-for file_name_pair, merged_reads_output_name in zip(file_name_pairs, merged_reads_output_names):
-    print('Processing {}, {}'.format(*file_name_pair))
+# read file names
+paired_file_paths = pd.read_csv(join(input_directory, 'sra_file_pairs.csv'))
+
+# remove date variable from dataframe columns - makes count file more interchangeable
+merged_reads_output_names = ['both_reads'.join(file.split('R1')) for file in paired_file_paths.R1]
+read_descriptors = [file.split('R1')[0] for file in paired_file_paths.R1]
+
+# save output stats here to print report when done
+output_stats = []
+
+# explicitly identify r1 and r2 files while keeping with `file_name_pair` naming scheme below
+for r1_file, r2_file, merged_reads_output_name in tqdm(zip(paired_file_paths.R1, paired_file_paths.R2, merged_reads_output_names), total=len(paired_file_paths), ncols=100, leave=True, desc='File'):
+    file_name_pair = [r1_file, r2_file]
     read_count = 0
     fwd_all_data = ''
     rev_all_data = ''
-    # open each pair of files
+
+    # open read files
     f1 = open(join(input_directory, file_name_pair[0]), 'r')
     f2 = open(join(input_directory, file_name_pair[1]), 'r')
-    # overwrite existing merged_read_file
+
+    # write empty merged read file
     with open(join(merged_reads_directory, merged_reads_output_name), 'w') as f:
         f.write('')
+
     # open in appending mode
     out_file = open(join(merged_reads_directory, merged_reads_output_name), 'a')
-    for l1, l2 in zip(f1, f2):
-        # process and write the read
+    for l1, l2 in tqdm(zip(f1, f2), leave=False, desc='Processing reads'):
+        # if a new read description is identified and compiled fwd/rev_all_data is not '',
+        # then fwd/rev_all_data is complete read. Process and identify read.
         if l1[0] == '@' and l2[0] == '@' and fwd_all_data != '' and rev_all_data != '':
             final_sequence = process_read_pair(fwd_all_data, rev_all_data)
             out_file.write(final_sequence + '\n')
             read_count += 1
+
+            # reset fwd/rev_all_data to ''
            fwd_all_data = ''
            rev_all_data = ''
-        # append to the new line
+
+        # append fastq data to fwd/rev_all_data until complete read data is compiled
+        # i.e. read descriptor, sequence, description, and quality scores.
         fwd_all_data += l1
         rev_all_data += l2
-    # process and write final read
+
+    # for loop does not process last read in file - do that here
     final_sequence = process_read_pair(fwd_all_data, rev_all_data)
     out_file.write(final_sequence + '\n')
     read_count += 1
-    print('Merged {} reads'.format(read_count))
 
+    # save output stats
+    output_stats.append((*file_name_pair, read_count))
+
+    # close files
+    f1.close()
+    f2.close()
+    out_file.close()
+
+# print output_stats
+print('All reads filtered for quality scores, final read counts:')
+for r1_file, r2_file, read_cnt in output_stats:
+    print(f"Experiment: {r1_file.split('R1')[0]} -- Total reads: {read_cnt}")
+
+print('Identifying reads in filtered fastq files...')
 unsorted_counts = None
-for merged_reads_output_name, read_descriptor in zip(merged_reads_output_names, read_descriptors):
+for merged_reads_output_name, read_descriptor in tqdm(zip(merged_reads_output_names, read_descriptors), total=len(paired_file_paths), ncols=100, leave=True, desc='File'):
     # open merged reads again
     with open(join(merged_reads_directory, merged_reads_output_name), 'r') as f:
         sequences = f.read().split('\n')
-    seq_ids = seq_df['seq_ID'].values.tolist()
     possible_sequences = seq_df['dna_seq'].values.tolist()
-    # calculate counts (add a count to a possible sequence iff the read matches exactly)
+
+    # calculate counts (add a count to a possible sequence if the read matches exactly)
+    # counts index matches that of sequence index
     counts = [0 for _ in possible_sequences]
     print('Determining counts for {} reads from {} dataset'.format(len(sequences), read_descriptor))
     for i, sequence in enumerate(sequences):
-        sequence = sequence[58:223] # this portion will also be unique to each protein library
+        sequence = sequence[58:223] # this is the unique protein sequence in the read
         if sequence in possible_sequences:
             counts[possible_sequences.index(sequence)] += 1
-    seq_df[read_descriptor + '_count'] = counts
-
-seq_df.to_csv(output_counts_file, index=False)
+    seq_df[read_descriptor + 'count'] = counts
+
+# add date here to distinguish different counts file outputs
+seq_df.to_csv(date+'_'+output_counts_file, index=False)
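One performance note on the counting loop above: `sequence in possible_sequences` plus `possible_sequences.index(sequence)` scans the whole design list for every read. A minimal sketch of an equivalent one-pass tally using a hash-based `Counter`, assuming the same `designs.csv` layout with a `dna_seq` column; the merged-reads file name is a hypothetical example of the loop's `both_reads` outputs:

```python
from collections import Counter

import pandas as pd

# Same inputs the script uses; file names here are illustrative only.
seq_df = pd.read_csv('designs.csv')
with open('merged_reads/sample_both_reads.fastq') as f:
    sequences = f.read().split('\n')

# One pass over the reads: slice out the 58:223 design region and tally.
# Counter lookups are O(1) versus list.index's O(n) scan per read.
tally = Counter(seq[58:223] for seq in sequences)

# Map tallies back onto the design table; unobserved designs get 0.
seq_df['sample_count'] = [tally.get(s, 0) for s in seq_df['dna_seq']]
```

For exact-match counting this produces identical results to the list-based version, just without the quadratic blow-up as read counts grow.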

Makefile

Lines changed: 11 additions & 9 deletions
@@ -1,18 +1,20 @@
-DATE=$(date +'%m%d%Y')
+DATE=$(shell date +%m-%d-%Y)
 
-all: preprocess
-	echo "Done"
+env:
+	conda env create -f environment.yml
+	conda activate gb1_inf
 
 extrapolation:
 	python 01_extrapolation_predictions.py
 
-preprocess: download # check to see if this is the best way to specify to download if folders do not exist
+process_sequencing:# sra_download
+	echo "${DATE}"
 	mkdir merged_reads
-	for d in fastq_files/ ; do
-	python preprocess.py fastq_files/${d} ${d:0:6} merged_reads/ designs.csv designs_counts.csv
+	python 03_preprocessing.py fastq_files ${DATE} merged_reads/ designs.csv designs_counts.csv
 
-download:
-	mkdir fastq_files
+sra_download:
 	# Download fastq files from NCBI SRA
-	echo "Not implemented yet"
+	echo "Total download size for SRA data is ~56 GB"
+	cd fastq_files; \
+	python download_sra_data.py
 
