Skip to content

Commit e8cee77

Browse files
carze and claude committed
fix: correct SAMPLE_ID/DB_ID assignment in summarize_kb_extract_reads
- Sample ID is now derived from the tarball filename rather than the FASTQ filename stem (which was just a number)
- DB_ID (palmDB_ID) correctly comes from the tarball subdirectory name, fixing the taxonomy lookup that was returning Unclassified RdRP for all rows

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 4203638 commit e8cee77

1 file changed

Lines changed: 9 additions & 5 deletions

File tree

pipes/WDL/tasks/tasks_metagenomics.wdl

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2230,6 +2230,10 @@ task summarize_kb_extract_reads {
22302230
TAXONOMY_LEVEL = "~{taxonomy_level}"
22312231
OUTPUT_FILE = "~{out_filename}"
22322232
2233+
# Derive sample ID from tarball filename: keep the text before the first '.' (drops .tar.gz, .tar.zst, etc.; a sample name containing '.' is truncated at that dot)
2234+
tar_basename = os.path.basename("~{extract_reads_tar}")
2235+
sample_id = tar_basename.split('.')[0]
2236+
22332237
# Load taxonomy map: palmDB_ID -> taxonomy lineage
22342238
print("Loading taxonomy map...", file=sys.stderr)
22352239
taxonomy_map = {}
@@ -2294,11 +2298,11 @@ task summarize_kb_extract_reads {
22942298
continue
22952299
22962300
filepath = os.path.join(sample_path, filename)
2297-
# Extract UID from filename (e.g., "UID12345.fastq.gz" -> "UID12345")
2298-
uid = filename.replace('.fastq.gz', '')
2301+
# Directory name is the DB_ID (palmDB_ID); sample ID comes from the tarball name
2302+
db_id = sample_name
22992303
2300-
# Look up taxonomy
2301-
tax_values = taxonomy_map.get(uid, [])
2304+
# Look up taxonomy by DB_ID
2305+
tax_values = taxonomy_map.get(db_id, [])
23022306
23032307
if tax_values:
23042308
if TAXONOMY_LEVEL == "deepest":
@@ -2332,7 +2336,7 @@ task summarize_kb_extract_reads {
23322336
seq_length = len(seq_line)
23332337
23342338
# Write output line
2335-
out_line = f"{sample_name}\t{read_id}\t{uid}\t{taxonomy_str}\t{tax_name}\t{seq_length}\n"
2339+
out_line = f"{sample_id}\t{read_id}\t{db_id}\t{taxonomy_str}\t{tax_name}\t{seq_length}\n"
23362340
compressor.write(out_line.encode('utf-8'))
23372341
reads_processed += 1
23382342

0 commit comments

Comments
 (0)