|
1 | 1 | import math |
2 | 2 | import numpy as np |
| 3 | +from tqdm import tqdm |
3 | 4 | from os.path import join |
4 | 5 | from os import listdir |
5 | 6 | import pandas as pd |
6 | 7 | import sys |
| 8 | +import subprocess |
7 | 9 |
|
8 | 10 | def get_read_data(all_data): |
9 | 11 | ''' |
@@ -67,68 +69,88 @@ def process_read_pair(fwd_all_data, rev_all_data): |
67 | 69 | input_seq_df: filed that should contain a column called 'dna_seq' which has the nucleotide seq we want to match |
68 | 70 | output_counts_file: where to output the counts for each NGS file |
69 | 71 | ''' |
70 | | - |
# Script entry: merge paired-end FASTQ reads, then count exact matches of each
# merged read against a reference sequence table.
#
# argv: input_directory date merged_reads_directory input_seq_df output_counts_file
_, input_directory, date, merged_reads_directory, input_seq_df, output_counts_file = sys.argv
seq_df = pd.read_csv(input_seq_df)

# read file names: CSV pairing forward (R1) and reverse (R2) fastq files
paired_file_paths = pd.read_csv(join(input_directory, 'sra_file_pairs.csv'))

# date is intentionally left out of the per-file descriptors so the count-file
# columns stay interchangeable across sequencing runs
merged_reads_output_names = ['both_reads'.join(file.split('R1')) for file in paired_file_paths.R1]
read_descriptors = [file.split('R1')[0] for file in paired_file_paths.R1]

# save output stats here to print a report when done
output_stats = []

for r1_file, r2_file, merged_reads_output_name in tqdm(
        zip(paired_file_paths.R1, paired_file_paths.R2, merged_reads_output_names),
        total=len(paired_file_paths), ncols=100, leave=True, desc='File'):
    read_count = 0
    fwd_all_data = ''
    rev_all_data = ''

    # context managers guarantee all three handles are closed even if a read
    # fails part-way; opening the output in 'w' mode truncates any old file,
    # so the separate "write empty file then append" step is unnecessary
    with open(join(input_directory, r1_file), 'r') as f1, \
         open(join(input_directory, r2_file), 'r') as f2, \
         open(join(merged_reads_directory, merged_reads_output_name), 'w') as out_file:
        for l1, l2 in tqdm(zip(f1, f2), leave=False, desc='Processing reads'):
            # a '@' header on both files marks the start of the next record;
            # whatever has accumulated so far is one complete read pair
            if l1[0] == '@' and l2[0] == '@' and fwd_all_data != '' and rev_all_data != '':
                final_sequence = process_read_pair(fwd_all_data, rev_all_data)
                out_file.write(final_sequence + '\n')
                read_count += 1

                # reset accumulators for the next record
                fwd_all_data = ''
                rev_all_data = ''

            # accumulate fastq lines until a full record is compiled,
            # i.e. read descriptor, sequence, description, and quality scores
            fwd_all_data += l1
            rev_all_data += l2

        # the loop never flushes the final record - do it here, guarding
        # against completely empty input files
        if fwd_all_data != '' and rev_all_data != '':
            final_sequence = process_read_pair(fwd_all_data, rev_all_data)
            out_file.write(final_sequence + '\n')
            read_count += 1

    # save output stats
    output_stats.append((r1_file, r2_file, read_count))

# print output_stats
print('All reads filtered for quality scores, final read counts:')
for r1_file, r2_file, read_cnt in output_stats:
    print(f"Experiment: {r1_file.split('R1')[0]} -- Total reads: {read_cnt}")

print('Identifying reads in filtered fastq files...')
for merged_reads_output_name, read_descriptor in tqdm(
        zip(merged_reads_output_names, read_descriptors),
        total=len(paired_file_paths), ncols=100, leave=True, desc='File'):
    # open merged reads again
    with open(join(merged_reads_directory, merged_reads_output_name), 'r') as f:
        sequences = f.read().split('\n')
    possible_sequences = seq_df['dna_seq'].values.tolist()

    # map sequence -> first row index once, so each read is an O(1) lookup
    # instead of two O(n) list scans per read; setdefault keeps the first
    # occurrence, matching list.index semantics if the library has duplicates
    seq_to_index = {}
    for i, seq in enumerate(possible_sequences):
        seq_to_index.setdefault(seq, i)

    # counts index matches sequence index; a read adds a count iff it
    # matches a library sequence exactly
    counts = [0] * len(possible_sequences)
    print('Determining counts for {} reads from {} dataset'.format(len(sequences), read_descriptor))
    for sequence in sequences:
        sequence = sequence[58:223]  # this is the unique protein sequence in the read
        idx = seq_to_index.get(sequence)
        if idx is not None:
            counts[idx] += 1
    # NOTE(review): column is descriptor + 'count' with no '_' separator (the
    # pre-change code used '_count'); descriptors are everything before 'R1'
    # and usually end in '_' already - confirm this is intentional
    seq_df[read_descriptor + 'count'] = counts

# add date here to distinguish different counts file outputs
seq_df.to_csv(date + '_' + output_counts_file, index=False)
0 commit comments