Skip to content

Commit b889e16

Browse files
authored
Improve TSV parsing with better error handling
Enhanced error handling and header validation in TSV parsing.
1 parent 9dd7f92 commit b889e16

File tree

1 file changed

+32
-11
lines changed

1 file changed

+32
-11
lines changed

modules/ebi-metagenomics/pathofact2/extractfasta/resources/usr/bin/pathofact_fasta_extractor.py

Lines changed: 32 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -127,31 +127,53 @@ def parse_blastp_best_hits(blastp_out):
127127
)
128128
return best
129129

130-
131130
def parse_pathofact2_predictions_tsv(tsv_path, threshold):
132131
"""
133132
Parse PathoFact2 TSV output and filter by a probability threshold.
134-
Header: Sequence Prediction Probability
135133
136-
Store: {sequence_id: probability} for predictions with probability >= threshold
134+
Expected header:
135+
Sequence Prediction Probability
136+
137+
Returns:
138+
{sequence_id: probability} for predictions with probability >= threshold
139+
140+
Raises:
141+
ValueError: if the file structure is not as expected.
137142
"""
138143
preds = {}
139144
if not _file_is_readable(tsv_path):
140145
return preds
141146

147+
expected_header = ["Sequence", "Prediction", "Probability"]
148+
142149
with _open_text_maybe_gzip(tsv_path) as handle:
143150
reader = csv.reader(handle, delimiter="\t")
144-
header = next(reader)
145151

146-
if len(header) < 3 or header[0] != "Sequence":
147-
logger.warning("Unexpected header in %s: %s", tsv_path, header)
152+
header = next(reader, None)
153+
if header is None:
154+
raise ValueError(f"Empty PathoFact2 TSV (no header): {tsv_path}")
155+
156+
header = [h.strip() for h in header]
157+
if header[:3] != expected_header:
158+
raise ValueError(
159+
f"Unexpected header in {tsv_path}. "
160+
f"Expected first 3 columns {expected_header} but got {header[:3]}"
161+
)
148162

149-
for row in reader:
150-
if not row or len(row) < 3:
151-
continue
163+
for line_no, row in enumerate(reader, start=2):
164+
if len(row) != 3:
165+
raise ValueError(
166+
f"Malformed row in '{tsv_path}' at line {line_no}: {row}"
167+
)
152168

153169
seq_id = row[0]
154-
probability = float(row[2])
170+
try:
171+
probability = float(row[2])
172+
except ValueError as e:
173+
raise ValueError(
174+
f"Invalid Probability value '{row[2]}' "
175+
f"in '{tsv_path}' at line {line_no}"
176+
) from e
155177

156178
if probability >= threshold:
157179
preds[seq_id] = probability
@@ -164,7 +186,6 @@ def parse_pathofact2_predictions_tsv(tsv_path, threshold):
164186
)
165187
return preds
166188

167-
168189
def collect_detected_sequence_ids(blast_hits, tox_preds, vf_preds):
169190
"""Union of all sequence IDs detected by any method."""
170191
ids = set()

0 commit comments

Comments
 (0)