|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +## Originally written by Jeferyd Yepes and released under the MIT license. |
| 4 | +## See git repository (https://github.com/nf-core/mag) for full license text. |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | +import re |
| 8 | +import argparse |
| 9 | +import sys |
| 10 | +import warnings |
| 11 | + |
def parse_args(args=None):
    """Parse the command-line options of the summary merger.

    Args:
        args: Sequence of argument strings to parse. ``None`` makes argparse
            fall back to ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with ``summary`` and ``gunc_summary`` (strings, or
        ``None`` when not given) and ``out`` (a file object opened for
        writing by ``argparse.FileType``).
    """
    # The two input options only differ in their flags and help text.
    input_options = (
        ("-s", "--summary", "Pipeline summary file."),
        ("-g", "--gunc_summary", "GUNC summary file."),
    )

    cli = argparse.ArgumentParser()
    for short_flag, long_flag, help_text in input_options:
        cli.add_argument(short_flag, long_flag, metavar="FILE", help=help_text)

    # The output file is the only mandatory option; argparse opens it eagerly.
    output_spec = {
        "required": True,
        "metavar": "FILE",
        "type": argparse.FileType("w"),
        "help": "Output file containing final bigmag summary.",
    }
    cli.add_argument("-o", "--out", **output_spec)

    return cli.parse_args(args)
| 26 | + |
| 27 | + |
# Suffixes stripped from the end of the pipeline summary's column names.
_TOOL_SUFFIX_PATTERN = r'(_busco|_checkm2|_checkm|_gtdbtk|_gunc|_quast)$'
# Trailing ".fa"/".fasta", optionally followed by further extensions.
_FASTA_SUFFIX_RE = re.compile(r'\.(fa|fasta)(\..*)?$')
# Captures everything after the second "-" of a bin name.
_SAMPLE_RE = re.compile(r'^.*?-.*?-(.*)$')
# Trailing ".unbinned"/".noclass", optionally followed by further extensions.
_UNBINNED_SUFFIX_RE = re.compile(r'\.(unbinned|noclass)(\..*)?$')
# Trailing ".<digits>", optionally followed by one more dot-free extension.
_BIN_NUMBER_SUFFIX_RE = re.compile(r'\.\d+(\.[^.]+)?$')


def _strip_fasta_suffix(name):
    """Return *name* without a trailing ``.fa``/``.fasta`` and what follows it."""
    return _FASTA_SUFFIX_RE.sub('', name)


def _sample_from_bin(bin_name):
    """Return the sample part of *bin_name*, or None if it has fewer than two '-'."""
    match = _SAMPLE_RE.search(bin_name)
    if match is None:
        return None
    # NOTE(review): presumably bin names look like "<assembler>-<binner>-<sample>.<n>";
    # confirm against the pipeline's naming scheme.
    sample = _UNBINNED_SUFFIX_RE.sub('', match.group(1))
    return _BIN_NUMBER_SUFFIX_RE.sub('', sample)


def main(args=None):
    """Merge the pipeline summary with the GUNC summary and write one TSV.

    Steps:
      1. Read the tab-separated pipeline summary, strip the tool suffixes
         from its column names and the FASTA extension from its ``bin``
         values, then sort by ``bin``.
      2. Read the tab-separated GUNC summary and left-join it onto the
         pipeline summary (``bin`` == ``genome``).
      3. Rename ``bin`` to ``Bin``, drop redundant identifier columns and
         add a ``sample`` column derived from the bin name.
      4. Write the result, including the row index, to ``--out``.

    Args:
        args: Argument strings forwarded to ``parse_args``; ``None`` means
            the process command line.

    Returns:
        None.

    Raises:
        SystemExit: If the pipeline summary or the GUNC summary is missing.
        KeyError: If the pipeline summary has no ``bin`` column or the GUNC
            summary has no ``genome`` column.
    """
    args = parse_args(args)

    # Both inputs are needed: with only one of them pd.read_csv(None) would
    # fail further down with an unrelated-looking error.
    if not args.summary or not args.gunc_summary:
        sys.exit(
            "No summary specified! "
            "Please specify the pipeline summary and the GUNC summary."
        )

    df_summary = pd.read_csv(args.summary, sep='\t')
    df_summary.columns = df_summary.columns.str.replace(_TOOL_SUFFIX_PATTERN, '', regex=True)
    # Cast to str *before* applying the regex so non-string bin names do not
    # raise TypeError; na_action skips values that are still missing.
    df_summary["bin"] = (
        df_summary["bin"].astype(str).map(_strip_fasta_suffix, na_action="ignore")
    )
    df_summary = df_summary.sort_values(by='bin')

    df_gunc = pd.read_csv(args.gunc_summary, sep='\t')
    df_gunc["genome"] = df_gunc["genome"].astype(str)
    df_gunc = df_gunc.sort_values(by='genome')

    # Left join: every bin of the pipeline summary is kept, with or without
    # a matching GUNC row.
    df_summary = pd.merge(df_summary, df_gunc, left_on='bin', right_on='genome', how='left')

    df_summary = df_summary.rename(columns={'bin': 'Bin'})
    columns_to_remove = ['Name', "genome", 'Input_file', 'Assembly', 'Bin Id']
    df_summary = df_summary.drop(columns=columns_to_remove, errors="ignore")

    # Bins whose name does not contain two '-' get no sample (empty field).
    df_summary["sample"] = df_summary["Bin"].map(_sample_from_bin, na_action="ignore")

    try:
        df_summary.to_csv(args.out, sep="\t", index=True)
    finally:
        # argparse.FileType leaves the handle open; close it so the data is
        # flushed, but never close the interpreter's stdout ("-o -").
        if args.out is not sys.stdout:
            args.out.close()
| 69 | + |
if __name__ == "__main__":
    # Run the merger and hand main()'s return value to the interpreter as
    # the process exit status.
    exit_status = main()
    sys.exit(exit_status)
0 commit comments