-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathresults_aggregation_script.py
81 lines (67 loc) · 3.33 KB
/
results_aggregation_script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#!/usr/bin/env python3
import sys
import pandas as pd
import argparse
import json
# Columns placed first in the aggregated sheet, in this order.
LEADING_COLUMNS = ["Barcode", "Number of clusters", "Total reads after filtering"]

# Column the relative-abundance threshold and the secondary sort act on.
ABUNDANCE_COLUMN = "Cluster relative abundance"

# Output column name -> key read from each entry of "cluster_data".
# The tuple order is the column order of the per-cluster part of the sheet.
CLUSTER_FIELDS = (
    ("Cluster ID", "cluster_id"),
    ("Cluster size", "cluster_size"),
    (ABUNDANCE_COLUMN, "relative_abundance"),
    ("Cluster sequence", "cluster_sequence"),
    ("Cluster sequence untrimmed", "cluster_sequence_untrimmed"),
    ("BLASTn taxonomy assignment", "blastn_tax_name"),
    ("BLASTn perc. ident.", "blastn_pident"),
    ("BLASTn query coverage", "blastn_query_coverage"),
    ("BLASTn query length", "blastn_query_length"),
    ("BLASTn subject length", "blastn_subject_length"),
    ("BLASTn evalue", "blastn_evalue"),
    ("BLASTn subject SH", "blastn_sh_id"),
    ("BLASTn full taxonomy", "blastn_full_taxonomy"),
)

# Text written into "Cluster sequence" when ITSx was not used.
NO_ITSX_NOTE = (
    "No ITSx sequence extraction. "
    "Taxonomy was assigned based on full, untrimmed sequence."
)


def _parse_args(argv=None):
    """Parse the command-line options of the aggregation script.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The populated ``argparse.Namespace``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--json_files', nargs='+', required=True, help='List of JSON result files')
    parser.add_argument('--rel_abu_threshold', type=float, required=True, help='Relative abundance threshold (0–100)')
    parser.add_argument('--output', type=str, required=True, help='Path to output file')
    parser.add_argument('--itsx_used', type=int, required=True, help='Set to 1 if ITSx was used. Otherwise set to 0')
    return parser.parse_args(argv)


def _iter_results(json_paths):
    """Yield the parsed content of each JSON file, printing its path once loaded.

    Loading is lazy so that reading, printing and processing of one file
    finish before the next file is opened.
    """
    for json_path in json_paths:
        # Explicit UTF-8 so decoding does not depend on the platform locale.
        with open(json_path, 'r', encoding='utf-8') as handle:
            json_data = json.load(handle)
        print(json_path)
        yield json_data


def _unprocessed_frame(json_data) -> pd.DataFrame:
    """Build the one-row frame for a barcode that has no "cluster_data".

    The barcode's "message" is stored in the "Number of clusters" column.
    """
    return pd.DataFrame([{
        "Barcode": json_data["barcode_id"],
        "Number of clusters": json_data["message"],
    }])


def _clusters_frame(json_data) -> pd.DataFrame:
    """Build one row per cluster of a processed barcode.

    Barcode-level values ("barcode_id", "number_of_clusters",
    "total_reads_after_filtering") are repeated on every row and moved to
    the front of the column order.
    """
    rows = []
    for cluster in json_data["cluster_data"]:
        row = {column: cluster[key] for column, key in CLUSTER_FIELDS}
        # Scale by 100 so the value is comparable with the 0-100 threshold
        # (presumably the JSON stores a fraction -- confirm with the producer).
        row[ABUNDANCE_COLUMN] = row[ABUNDANCE_COLUMN] * 100
        rows.append(row)

    frame = pd.DataFrame(rows)
    frame["Barcode"] = json_data["barcode_id"]
    frame["Number of clusters"] = json_data["number_of_clusters"]
    frame["Total reads after filtering"] = json_data["total_reads_after_filtering"]

    trailing = [col for col in frame.columns if col not in LEADING_COLUMNS]
    return frame[LEADING_COLUMNS + trailing]


def build_report(results, rel_abu_threshold: float, itsx_used: int) -> pd.DataFrame:
    """Aggregate per-barcode result dictionaries into one table.

    Args:
        results: Iterable of parsed JSON result dictionaries. A dictionary
            without a "cluster_data" key is treated as an unprocessed
            barcode.
        rel_abu_threshold: Clusters whose relative abundance (after scaling
            by 100) is below this value are dropped.
        itsx_used: When not equal to 1, the "Cluster sequence" column of
            every row is overwritten with an explanatory note.

    Returns:
        A DataFrame with unprocessed barcodes first, followed by the
        retained clusters sorted by barcode (ascending) and relative
        abundance (descending).

    Raises:
        KeyError: If a dictionary lacks one of the keys read for its kind.
    """
    unprocessed = []
    processed = []
    for json_data in results:
        if "cluster_data" not in json_data:
            unprocessed.append(_unprocessed_frame(json_data))
        else:
            processed.append(_clusters_frame(json_data))

    report = pd.concat(processed, ignore_index=True) if processed else pd.DataFrame()

    # The abundance column is absent when nothing was processed or when every
    # processed barcode had an empty cluster list; filtering would then raise.
    if ABUNDANCE_COLUMN in report.columns:
        report = report[report[ABUNDANCE_COLUMN] >= rel_abu_threshold]
        report = report.sort_values(
            by=["Barcode", ABUNDANCE_COLUMN], ascending=[True, False]
        )

    if unprocessed:
        # Unprocessed barcodes go on top. A column-less placeholder frame adds
        # nothing to the result, so it is left out of the concatenation.
        frames = unprocessed + ([report] if len(report.columns) else [])
        report = pd.concat(frames, ignore_index=True)

    if itsx_used != 1:
        report["Cluster sequence"] = NO_ITSX_NOTE
    return report


def main(argv=None) -> None:
    """Read the JSON result files, aggregate them and write the Excel sheet."""
    args = _parse_args(argv)
    report = build_report(
        _iter_results(args.json_files), args.rel_abu_threshold, args.itsx_used
    )
    report.to_excel(args.output, engine="openpyxl", index=False)


if __name__ == "__main__":
    main()