33import csv
44import glob
55import os
6+ import re
7+ from pathlib import Path
68from shutil import copyfile
79
810import xlsxwriter
911
1012
# Where the combined workbook and the copied HTML reports are written.
METRICS_PATH = "finalreport/"
SUMMARY_PATH = "finalreport/summaries/"

# "<tool name> Version <version>" metadata lines.
PIPELINE_VERSION_RE = re.compile(r"^(?P<name>.+?) Version (?P<version>[^ ]+)$")
# Optionally signed integers and decimal numbers (no exponent form).
INT_RE = re.compile(r"^[+-]?\d+$")
FLOAT_RE = re.compile(r"^[+-]?(?:\d+\.\d*|\d*\.\d+)$")


def coerce_value(value):
    """Coerce a raw CSV cell into a ``(value, kind)`` pair for spreadsheet output.

    Returns one of:
        (float, "percent")  for values like "12.5%", scaled into the 0-1 range,
        (int,   "int")      for plain integers (thousands separators allowed),
        (float, "float")    for plain decimal numbers,
        (text,  None)       for blanks, "-" placeholders, and anything else
                            (the stripped original text is passed through).
    """
    text = str(value).strip()

    # Empty cells and dash placeholders are passed through untyped.
    if text in ("", "-"):
        return text, None

    if text.endswith("%"):
        body = text[:-1].strip().replace(",", "")
        if INT_RE.match(body) or FLOAT_RE.match(body):
            # Spreadsheet percent format expects a 0-1 fraction.
            return float(body) / 100.0, "percent"
        return text, None

    body = text.replace(",", "")
    if INT_RE.match(body):
        return int(body), "int"
    if FLOAT_RE.match(body):
        return float(body), "float"
    return text, None
36+
37+
def next_nonempty(lines, start_idx):
    """Return the index of the first non-blank line at or after ``start_idx``.

    Returns ``len(lines)`` (or ``start_idx`` if it is already past the end)
    when every remaining line is blank.
    """
    for offset, line in enumerate(lines[start_idx:]):
        if line.strip():
            return start_idx + offset
    # Exhausted: mirror the original loop's terminal index.
    return max(start_idx, len(lines))
43+
44+
def parse_metadata_line(line):
    """Parse one '##' metadata comment line into a list of (key, value) pairs.

    Recognized shapes (after stripping the leading '##'):
      * banner/blank lines ("####...", "##")      -> []
      * "Section - k1: v1 | k2: v2 | free text"   -> [("Section.k1", "v1"), ...]
        (items without ": " fall back to the bare section name as key)
      * "<tool> Version <x.y.z>"                  -> Pipeline.Name / Pipeline.Version
      * anything else                             -> [("Pipeline.Info", text)]

    Fix: the key format strings previously contained stray spaces inside the
    literals (e.g. producing "Section .k1 "); keys are now "Section.k1".
    """
    content = line[2:].strip()
    # Banner/separator lines and empty comments carry no data.
    if not content or set(content) == {"#"}:
        return []

    if " - " in content:
        section, remainder = content.split(" - ", 1)
        entries = []
        for item in remainder.split(" | "):
            key, sep, value = item.partition(": ")
            if sep:
                entries.append((f"{section}.{key}", value))
            else:
                entries.append((section, item))
        return entries

    # Same pattern as the module-level PIPELINE_VERSION_RE constant,
    # inlined so the function is self-contained.
    match = re.match(r"^(?P<name>.+?) Version (?P<version>[^ ]+)$", content)
    if match:
        return [
            ("Pipeline.Name", match.group("name")),
            ("Pipeline.Version", match.group("version")),
        ]

    return [("Pipeline.Info", content)]
68+
69+
def flatten_section(section_name, header, rows):
    """Flatten one parsed CSV section into a ``{qualified_key: value}`` dict.

    Args:
        section_name: Name taken from the '#Section' marker line.
        header: Column names, in file order.
        rows: List of ``{column: value}`` dicts for the section's data rows.

    Keys are ``"Section.Column"``, or ``"Section[RowLabel].Column"`` when the
    section has multiple rows and an identifying column ("Library" or
    "Bioproduct_Type") to label them with; the id column itself is skipped.
    Later rows overwrite earlier ones on key collisions.

    Fix: the key format strings previously contained stray spaces inside the
    literals (producing keys like "Section .Column "); keys are now clean.
    """
    if not rows:
        return {}

    # Column that distinguishes rows from each other, if the section has one.
    id_column = next(
        (candidate for candidate in ("Library", "Bioproduct_Type") if candidate in header),
        None,
    )

    # Row labels are only useful when there is more than one row to tell apart.
    use_labels = id_column is not None and len(rows) > 1

    flattened = {}
    for row in rows:
        label = row.get(id_column, "") if use_labels else None
        for column in header:
            if column == id_column:
                continue
            if label:
                key = f"{section_name}[{label}].{column}"
            else:
                key = f"{section_name}.{column}"
            flattened[key] = row.get(column, "")
    return flattened
94+
95+
def parse_metrics_summary(filename):
    """Read one metrics-summary CSV and split it into (metadata, metrics) dicts.

    File layout: '##' lines are pipeline metadata (parse_metadata_line);
    a '#Section Name' line opens a CSV section consisting of a header row
    followed by data rows, terminated by a blank line, another '#' marker,
    or end of file. Section rows are flattened via flatten_section().

    Returns:
        (metadata, metrics) — two flat ``{key: value}`` dicts.
    """
    raw = Path(filename).read_text(encoding="utf-8", errors="replace")
    lines = raw.splitlines()
    metadata = {}
    metrics = {}
    i = 0

    while i < len(lines):
        text = lines[i].strip()

        if not text:
            i += 1
            continue

        if text.startswith("##"):
            # Metadata comment line.
            for key, value in parse_metadata_line(text):
                metadata[key] = value
            i += 1
            continue

        if text.startswith("#"):
            # Section marker: read the header line, then its data rows.
            section_name = text.strip("#").strip()
            i = next_nonempty(lines, i + 1)
            if i >= len(lines):
                break
            header_text = lines[i].strip()
            if not header_text or header_text.startswith("#"):
                # Empty section (another marker follows); reprocess that line.
                continue
            columns = next(csv.reader([header_text]))
            i += 1

            rows = []
            while i < len(lines):
                row_text = lines[i].strip()
                if not row_text:
                    i += 1  # consume the blank terminator
                    break
                if row_text.startswith("#"):
                    # Next section begins; leave the marker for the outer loop.
                    break
                rows.append(dict(zip(columns, next(csv.reader([row_text])))))
                i += 1

            metrics.update(flatten_section(section_name, columns, rows))
            continue

        # Plain data line outside any section: ignore it.
        i += 1

    return metadata, metrics
138+
139+
140+ def write_sheet (workbook , name , headers , rows , coerce_numbers = True ):
141+ worksheet = workbook .add_worksheet (name )
40142 format_num = workbook .add_format ({"num_format" : "#,##0" })
41143 format_float = workbook .add_format ({"num_format" : "#,##0.00" })
42144 format_per = workbook .add_format ({"num_format" : "0.00%" })
43145 format_head = workbook .add_format ({"bold" : True , "italic" : True , "text_wrap" : True , "align" : "center" })
44146
45- headers = ["Sample" ]
46- rows = []
47- for filename in files :
48- with open (filename , newline = "" , encoding = "utf-8" ) as handle :
49- reader = csv .DictReader (handle )
50- data = next (reader )
51- sample = filename .split ("/" )[1 ]
52- for key in data :
53- if key not in headers :
54- headers .append (key )
55- rows .append ((sample , data ))
56-
57147 for col , header in enumerate (headers ):
58148 worksheet .write (0 , col , header , format_head )
59149
60- for row_idx , (sample , data ) in enumerate (rows , start = 1 ):
61- worksheet .write (row_idx , 0 , sample )
62- for col_idx , header in enumerate (headers [1 :], start = 1 ):
63- value , kind = coerce_value (data .get (header , "" ))
150+ for row_idx , row in enumerate (rows , start = 1 ):
151+ for col_idx , header in enumerate (headers ):
152+ raw_value = row .get (header , "" )
153+ if not coerce_numbers :
154+ worksheet .write (row_idx , col_idx , raw_value )
155+ continue
156+ value , kind = coerce_value (raw_value )
64157 if kind == "percent" :
65158 worksheet .write (row_idx , col_idx , value , format_per )
66159 elif kind == "float" :
@@ -70,11 +163,45 @@ def main(output_name="metric_summary"):
70163 else :
71164 worksheet .write (row_idx , col_idx , value )
72165
73- worksheet .set_column (0 , len (headers ) - 1 , 18 )
166+ worksheet .set_column (0 , len (headers ) - 1 , 22 )
167+
168+
def main(output_name="metric_summary"):
    """Collect every per-sample ``*_Metrics_Summary.csv`` into one Excel
    workbook and copy each sample's pipeline HTML report into SUMMARY_PATH.

    Args:
        output_name: Base name (without extension) of the generated .xlsx file.

    Fixes: the workbook is now opened via a ``with`` block so it is always
    closed (and flushed to disk) even if a sheet write raises, and the
    extraction-garbled string literals (spaces inside the .xlsx/.html
    format strings and the report glob) are repaired.
    """
    os.makedirs(METRICS_PATH, exist_ok=True)
    os.makedirs(SUMMARY_PATH, exist_ok=True)

    files = sorted(glob.glob("./*/*_Metrics_Summary.csv"))

    summary_headers = ["Sample"]
    metadata_headers = ["Sample"]
    summary_rows = []
    metadata_rows = []

    for filename in files:
        # The sample name is the directory that contains the CSV.
        sample = Path(filename).parent.name
        metadata, metrics = parse_metrics_summary(filename)

        summary_rows.append({"Sample": sample, **metrics})
        metadata_rows.append({"Sample": sample, **metadata})

        # Preserve first-seen column order across samples.
        for key in metrics:
            if key not in summary_headers:
                summary_headers.append(key)
        for key in metadata:
            if key not in metadata_headers:
                metadata_headers.append(key)

    # Workbook is a context manager: guarantees close() on success or error.
    workbook_path = os.path.join(METRICS_PATH, f"{output_name}.xlsx")
    with xlsxwriter.Workbook(workbook_path) as workbook:
        write_sheet(workbook, "metrics_summary", summary_headers, summary_rows)
        # Only emit the metadata sheet when at least one file contributed keys.
        if len(metadata_headers) > 1:
            write_sheet(workbook, "run_metadata", metadata_headers, metadata_rows, coerce_numbers=False)

    for filename in sorted(glob.glob("./*/*_Pipeline_Report.html")):
        sample = Path(filename).parent.name
        copyfile(filename, os.path.join(SUMMARY_PATH, f"{sample}_web_summary.html"))
79206
80207
0 commit comments