Skip to content

Commit 8e23c46

Browse files
committed
Readded part3 script for docker build
1 parent 211b4d4 commit 8e23c46

File tree

1 file changed

+58
-0
lines changed

1 file changed

+58
-0
lines changed
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/bin/python
2+
3+
import argparse
4+
from collections import defaultdict
5+
from os import mkdir, path
6+
7+
8+
def count_variants(infile):
    """Count how many lines of *infile* belong to each variant ID.

    The variant ID is the first tab-separated field of each line, taken
    after surrounding whitespace has been stripped from the line.

    Args:
        infile: Path to the tab-delimited input file.

    Returns:
        A plain ``dict`` mapping variant ID to its line count, ordered from
        the most frequent ID to the least frequent. IDs with equal counts
        keep the order in which they were first seen in the file.
    """
    tally = defaultdict(int)
    with open(infile, 'r') as handle:
        for record in handle:
            # Everything before the first tab is the variant ID.
            variant_id, _, _ = record.strip().partition('\t')
            tally[variant_id] += 1
    # The sort is stable, so ties retain first-seen order.
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)
15+
16+
17+
def assign_shards(variant_counts, max_samples):
    """Assign each variant to a shard so shards stay within *max_samples*.

    Variants are visited in the iteration order of *variant_counts* and
    packed greedily: a new shard is started whenever adding the next
    variant would push the current shard's running total above
    *max_samples*. The very first variant always lands in shard 0, even
    if its own count already exceeds the limit.

    Args:
        variant_counts: Mapping of variant ID to its number of entries.
        max_samples: Maximum total entries allowed per shard.

    Returns:
        A tuple ``(last_shard, assignments)`` where ``last_shard`` is the
        0-based index of the final shard used (0 when the input is empty)
        and ``assignments`` maps each variant ID to its shard index.
    """
    assignments = {}
    current_shard = 0
    running_total = 0
    for variant_id, n_entries in variant_counts.items():
        # An empty `assignments` means this is the first variant, which
        # must never trigger a roll-over (shard 0 would be left empty).
        if assignments and running_total + n_entries > max_samples:
            current_shard += 1
            running_total = n_entries
        else:
            running_total += n_entries
        assignments[variant_id] = current_shard
    return current_shard, assignments
30+
31+
32+
def create_shards(infile, shard_assignments, num_shards):
    """Split *infile* into per-shard files under ``./shards``.

    Each input line is routed by its variant ID (the first tab-separated
    field of the stripped line) to ``shards/out.{shard}_{num_shards}.txt``
    and written unchanged. Lines keep their relative input order within
    each shard file.

    Shard files left over from an earlier run are overwritten rather than
    appended to, so calling this repeatedly does not duplicate records.

    Args:
        infile: Path to the tab-delimited input file.
        shard_assignments: Mapping of variant ID to shard index.
        num_shards: Value embedded in every output file name after the
            underscore.

    Raises:
        KeyError: If a line's variant ID is missing from
            *shard_assignments*.
    """
    if not path.isdir("./shards"):
        mkdir("./shards")

    started = set()       # shards whose file was already (re)created in this call
    current_shard = None  # shard that `out` currently points at
    out = None
    try:
        with open(infile, 'r') as IN:
            for line in IN:
                var_id = line.strip().split('\t')[0]
                shard = shard_assignments[var_id]
                # Only switch files when the shard changes; consecutive
                # lines for one shard reuse the already-open handle instead
                # of paying an open/close per line.
                if out is None or shard != current_shard:
                    if out is not None:
                        out.close()
                    # Truncate on first touch so stale output from a prior
                    # run is discarded; append on later revisits.
                    mode = 'a' if shard in started else 'w'
                    out = open(f"shards/out.{shard}_{num_shards}.txt", mode)
                    started.add(shard)
                    current_shard = shard
                out.write(line)
    finally:
        if out is not None:
            out.close()
42+
43+
44+
def main():
    """Parse command-line arguments and shard the combined input file.

    Counts the entries per variant in the input file, assigns variants to
    shards under the ``--max-samples`` limit, and writes the shard files.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "combined_file",
        help="rd_cn_revise file with variant ID, sample ID, and CN columns",
    )
    cli.add_argument(
        "-s",
        "--max-samples",
        type=int,
        default=7000,
        help="Maximum number of variant x sample entries in a shard (default = 7,000)",
    )
    options = cli.parse_args()
    combined = options.combined_file

    counts = count_variants(combined)
    # assign_shards returns the final shard index, which create_shards
    # embeds in every output file name.
    last_shard, assignments = assign_shards(counts, options.max_samples)
    create_shards(combined, assignments, last_shard)
55+
56+
57+
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)