Skip to content

Commit 2e5732c

Browse files
committed
Address Copilot review: docstring clarity, consensus tie-breaking, unparseable header detection, param validation
- Clarify matched_bc1/matched_bc2 docstring to note they contain FASTQ-observed values (not samplesheet values) when skipped_reason is set - Require strict majority (>50%) in consensus vote; return N for ties - Raise ValueError when FASTQ has reads but none have parseable barcode headers (previously returned (None, None) same as empty files, masking malformed FASTQs) - Validate max_barcode_mismatches >= 0 and num_reads_for_barcode >= 1 - Add tests for tie-breaking and unparseable header detection
1 parent a29e148 commit 2e5732c

2 files changed

Lines changed: 57 additions & 5 deletions

File tree

src/viral_ngs/illumina.py

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -320,8 +320,12 @@ def match_barcodes_with_orientation(target_bc1, target_bc2, sample_rows,
320320
matched_rows: List of matching sample rows (empty if no match, ambiguous, or high N fraction)
321321
orientation_info: dict with keys:
322322
- 'barcode_2_revcomp': bool (True if reverse complement was needed)
323-
- 'matched_bc1': str (the samplesheet barcode_1 value that matched)
324-
- 'matched_bc2': str (the samplesheet barcode_2 value that matched)
323+
- 'matched_bc1': str. On successful match (no 'skipped_reason'), this is
324+
the samplesheet barcode_1 value. When 'skipped_reason' is set, this
325+
is the normalized barcode_1 observed in the FASTQ header.
326+
- 'matched_bc2': str. On successful match (no 'skipped_reason'), this is
327+
the samplesheet barcode_2 value. When 'skipped_reason' is set, this
328+
is the normalized barcode_2 observed in the FASTQ header.
325329
- 'skipped_reason': str (if empty results, why - 'high_n_fraction', 'ambiguous', or 'no_match')
326330
"""
327331
log = logging.getLogger(__name__)
@@ -860,19 +864,24 @@ def consensus_barcode_from_fastq(fastq_path, num_reads=10):
860864
861865
Returns:
862866
tuple: (barcode_1, barcode_2_or_None) normalized consensus barcodes,
863-
or (None, None) if file is empty or has no parseable barcodes.
867+
or (None, None) if file is empty (no reads).
868+
869+
Raises:
870+
ValueError: If the FASTQ has reads but none have parseable barcode headers.
864871
"""
865872
log = logging.getLogger(__name__)
866873

867874
bc1_list = []
868875
bc2_list = []
876+
headers_seen = 0
869877

870878
with util_file.open_or_gzopen(fastq_path, 'rt') as f:
871879
reads_found = 0
872880
for line_num, line in enumerate(f):
873881
# FASTQ format: header is every 4th line starting at line 0
874882
if line_num % 4 != 0:
875883
continue
884+
headers_seen += 1
876885
if reads_found >= num_reads:
877886
break
878887

@@ -890,14 +899,27 @@ def consensus_barcode_from_fastq(fastq_path, num_reads=10):
890899
reads_found += 1
891900

892901
if not bc1_list:
902+
if headers_seen > 0:
903+
raise ValueError(
904+
f"FASTQ file has reads but no parseable barcode headers in first "
905+
f"{headers_seen} reads: {fastq_path}"
906+
)
893907
return None, None
894908

895909
def _position_consensus(bases_at_position):
896-
"""Return majority base at a position, ignoring N."""
910+
"""Return majority base at a position, ignoring N.
911+
912+
Requires strict majority (>50% of non-N bases). Returns N if
913+
there is no clear winner (e.g., a tie).
914+
"""
897915
non_n = [b for b in bases_at_position if b != 'N']
898916
if not non_n:
899917
return 'N'
900-
return Counter(non_n).most_common(1)[0][0]
918+
counts = Counter(non_n)
919+
top_base, top_count = counts.most_common(1)[0]
920+
if top_count * 2 <= len(non_n):
921+
return 'N' # no strict majority (tie or split)
922+
return top_base
901923

902924
# Build consensus for bc1
903925
bc1_len = len(bc1_list[0])
@@ -1025,6 +1047,10 @@ def splitcode_demux_fastqs(
10251047
raise FileNotFoundError(f"R2 FASTQ not found: {fastq_r2}")
10261048
if not os.path.exists(samplesheet):
10271049
raise FileNotFoundError(f"Samplesheet not found: {samplesheet}")
1050+
if max_barcode_mismatches < 0:
1051+
raise ValueError(f"max_barcode_mismatches must be >= 0, got {max_barcode_mismatches}")
1052+
if num_reads_for_barcode < 1:
1053+
raise ValueError(f"num_reads_for_barcode must be >= 1, got {num_reads_for_barcode}")
10281054

10291055
# Parse RunInfo.xml if provided (unless flowcell_id and run_date both overridden)
10301056
runinfo_obj = None

tests/unit/core/test_illumina.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1618,6 +1618,32 @@ def test_consensus_resolves_mismatch(self):
16181618
bc1, bc2 = viral_ngs.illumina.consensus_barcode_from_fastq(fq, num_reads=10)
16191619
self.assertEqual(bc1, 'ATCGATCG') # consensus corrects the mismatch
16201620

1621+
def test_consensus_tie_returns_n(self):
1622+
"""Even split (5 A vs 5 G) at a position → N (no strict majority)."""
1623+
fq = os.path.join(self.tmpdir, 'test.fastq.gz')
1624+
reads = []
1625+
for i in range(5):
1626+
reads.append((f"@INST:1:FC:1:1:1:{i} 1:N:0:ATCGATCG+GCTAGCTA", "ACGTACGT"))
1627+
for i in range(5):
1628+
reads.append((f"@INST:1:FC:1:1:1:{5+i} 1:N:0:GTCGATCG+GCTAGCTA", "ACGTACGT"))
1629+
self._write_fastq(fq, reads)
1630+
bc1, bc2 = viral_ngs.illumina.consensus_barcode_from_fastq(fq, num_reads=10)
1631+
self.assertEqual(bc1[0], 'N') # tie at pos 0 → N
1632+
self.assertEqual(bc1[1:], 'TCGATCG') # rest is unanimous
1633+
1634+
def test_unparseable_headers_raises(self):
1635+
"""FASTQ with reads but no parseable barcode headers raises ValueError."""
1636+
fq = os.path.join(self.tmpdir, 'test.fastq.gz')
1637+
with gzip.open(fq, 'wt') as f:
1638+
# Write reads with non-standard headers (no barcode field)
1639+
for i in range(3):
1640+
f.write(f"@read_{i}\n")
1641+
f.write("ACGTACGT\n")
1642+
f.write("+\n")
1643+
f.write("IIIIIIII\n")
1644+
with self.assertRaises(ValueError):
1645+
viral_ngs.illumina.consensus_barcode_from_fastq(fq, num_reads=10)
1646+
16211647

16221648
class TestParseBarcode(unittest.TestCase):
16231649
"""Test _parse_barcode_from_header() helper."""

0 commit comments

Comments
 (0)