@@ -127,31 +127,53 @@ def parse_blastp_best_hits(blastp_out):
127127 )
128128 return best
129129
130-
131130def parse_pathofact2_predictions_tsv (tsv_path , threshold ):
132131 """
133132 Parse PathoFact2 TSV output and filter by a probability threshold.
134- Header: Sequence Prediction Probability
135133
136- Store: {sequence_id: probability} for predictions with probability >= threshold
134+ Expected header:
135+ Sequence Prediction Probability
136+
137+ Returns:
138+ {sequence_id: probability} for predictions with probability >= threshold
139+
140+ Raises:
141+ ValueError: if the file structure is not as expected.
137142 """
138143 preds = {}
139144 if not _file_is_readable (tsv_path ):
140145 return preds
141146
147+ expected_header = ["Sequence" , "Prediction" , "Probability" ]
148+
142149 with _open_text_maybe_gzip (tsv_path ) as handle :
143150 reader = csv .reader (handle , delimiter = "\t " )
144- header = next (reader )
145151
146- if len (header ) < 3 or header [0 ] != "Sequence" :
147- logger .warning ("Unexpected header in %s: %s" , tsv_path , header )
152+ header = next (reader , None )
153+ if header is None :
154+ raise ValueError (f"Empty PathoFact2 TSV (no header): { tsv_path } " )
155+
156+ header = [h .strip () for h in header ]
157+ if header [:3 ] != expected_header :
158+ raise ValueError (
159+ f"Unexpected header in { tsv_path } . "
160+ f"Expected first 3 columns { expected_header } but got { header [:3 ]} "
161+ )
148162
149- for row in reader :
150- if not row or len (row ) < 3 :
151- continue
163+ for line_no , row in enumerate (reader , start = 2 ):
164+ if len (row ) != 3 :
165+ raise ValueError (
166+ f"Malformed row in '{ tsv_path } ' at line { line_no } : { row } "
167+ )
152168
153169 seq_id = row [0 ]
154- probability = float (row [2 ])
170+ try :
171+ probability = float (row [2 ])
172+ except ValueError as e :
173+ raise ValueError (
174+ f"Invalid Probability value '{ row [2 ]} ' "
175+ f"in '{ tsv_path } ' at line { line_no } "
176+ ) from e
155177
156178 if probability >= threshold :
157179 preds [seq_id ] = probability
@@ -164,7 +186,6 @@ def parse_pathofact2_predictions_tsv(tsv_path, threshold):
164186 )
165187 return preds
166188
167-
168189def collect_detected_sequence_ids (blast_hits , tox_preds , vf_preds ):
169190 """Union of all sequence IDs detected by any method."""
170191 ids = set ()
0 commit comments