1+ #!/bin/python
2+
3+ import argparse
4+ from collections import defaultdict
5+ from os import mkdir , path
6+
7+
def count_variants(infile):
    """Count how many lines of *infile* belong to each variant ID.

    The variant ID is the first tab-separated field of each line, taken
    after surrounding whitespace has been stripped from the line.

    Args:
        infile: Path to a tab-delimited text file.

    Returns:
        A dict mapping variant ID to its number of lines, ordered from the
        most frequent variant to the least frequent. Variants with equal
        counts keep the order in which they first appear in the file.
    """
    variant_counts = defaultdict(int)
    with open(infile, 'r') as records:
        for line in records:
            # Only the first column is needed, so split at most once.
            var_id = line.strip().split('\t', 1)[0]
            variant_counts[var_id] += 1
    # sorted() is stable, so ties retain first-seen order even with reverse=True.
    ranked = sorted(variant_counts.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked)
15+
16+
def assign_shards(variant_counts, max_samples):
    """Greedily pack variants into consecutively numbered shards.

    Variants are visited in the iteration order of *variant_counts*. A new
    shard is started whenever adding the next variant would push the
    current shard's running total above *max_samples*. All entries of one
    variant always land in the same shard, so a single variant whose count
    exceeds *max_samples* still occupies one (oversized) shard.

    Args:
        variant_counts: Mapping of variant ID to its number of entries.
        max_samples: Maximum total number of entries allowed per shard.

    Returns:
        A ``(last_shard, shard_assignments)`` tuple. ``last_shard`` is the
        zero-based index of the final shard used (so it is one less than
        the number of shards, and 0 for empty input), and
        ``shard_assignments`` maps each variant ID to its shard index.
    """
    shard_assignments = {}
    shard_number = 0
    entries_in_shard = 0
    for variant, count in variant_counts.items():
        # The very first variant always goes into shard 0, whatever its size;
        # "assignments is non-empty" is exactly "this is not the first variant".
        if shard_assignments and entries_in_shard + count > max_samples:
            shard_number += 1
            entries_in_shard = 0
        shard_assignments[variant] = shard_number
        entries_in_shard += count
    return shard_number, shard_assignments
30+
31+
def create_shards(infile, shard_assignments, num_shards):
    """Split *infile* into per-shard files under ``./shards``.

    Each line is copied unchanged to ``shards/out.<shard>_<num_shards>.txt``,
    where ``<shard>`` is looked up from the line's variant ID (its first
    tab-separated field after stripping surrounding whitespace). The
    ``shards`` directory is created in the current working directory if it
    does not already exist.

    Args:
        infile: Path to the tab-delimited input file.
        shard_assignments: Mapping of variant ID to shard index.
        num_shards: Value embedded in every output filename; it is not
            otherwise interpreted here.

    Raises:
        KeyError: If a line's variant ID is missing from *shard_assignments*.
    """
    if not path.isdir("./shards"):
        mkdir("./shards")

    started = set()       # shards already written to during this call
    current_shard = None  # shard whose file is currently open
    out = None
    try:
        with open(infile, 'r') as records:
            for line in records:
                var_id = line.strip().split('\t', 1)[0]
                shard = shard_assignments[var_id]
                if shard != current_shard:
                    # Keep a single handle open at a time: consecutive lines
                    # for the same shard reuse it, and the number of open
                    # files stays constant however many shards there are.
                    if out is not None:
                        out.close()
                    # Truncate on first touch so leftovers from an earlier
                    # run are not appended to; append when returning to a
                    # shard already started in this run.
                    mode = 'a' if shard in started else 'w'
                    out = open(f"shards/out.{shard}_{num_shards}.txt", mode)
                    started.add(shard)
                    current_shard = shard
                out.write(line)
    finally:
        if out is not None:
            out.close()
42+
43+
def main():
    """Parse command-line arguments and shard the combined file.

    Counts the entries per variant in the input file, assigns variants to
    shards limited by ``--max-samples``, and writes the shard files.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "combined_file",
        help="rd_cn_revise file with variant ID, sample ID, and CN columns",
    )
    cli.add_argument(
        "-s",
        "--max-samples",
        type=int,
        default=7000,
        help="Maximum number of variant x sample entries in a shard (default = 7,000)",
    )
    options = cli.parse_args()

    counts = count_variants(options.combined_file)
    # assign_shards returns the index of the last shard alongside the mapping;
    # that index is what ends up in the shard filenames.
    last_shard, assignments = assign_shards(counts, options.max_samples)
    create_shards(options.combined_file, assignments, last_shard)
55+
56+
# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
    main()
0 commit comments