Skip to content

Commit b98195e

Browse files
committed
fix: stream zstd writes and raise memory in parse_kraken2_reads
OOM kills were occurring because all read rows were accumulated in a list before writing, adding gigabytes of Python tuple overhead on top of the ~2.8 GB already used by the taxonomy dicts. Rows are now written to the zstd compressor immediately as each line is parsed. This commit also raises the default machine_mem_gb from 8 to 16 to provide adequate headroom for the taxonomy dict overhead (~2.8M entries across 4 dicts).
1 parent 9f3d1d9 commit b98195e

File tree

1 file changed

+40
-51
lines changed

1 file changed

+40
-51
lines changed

pipes/WDL/tasks/tasks_metagenomics.wdl

Lines changed: 40 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1986,7 +1986,7 @@ task parse_kraken2_reads {
19861986
else sub(basename(kraken2_reads_output), "\\.kraken2\\.reads\\.txt(\\.gz)?$", "")
19871987
Boolean resolve_strains = false
19881988

1989-
Int machine_mem_gb = 8
1989+
Int machine_mem_gb = 16
19901990
String docker = "quay.io/broadinstitute/py3-bio:0.1.5"
19911991
}
19921992

@@ -2130,70 +2130,59 @@ task parse_kraken2_reads {
21302130
classified_count = 0
21312131
unclassified_count = 0
21322132
2133-
# Collect rows
2134-
rows = []
2135-
2136-
try:
2137-
for line in f:
2138-
# Skip empty lines
2139-
line = line.strip()
2140-
if not line:
2141-
continue
2142-
2143-
# Parse Kraken2 output format
2144-
# Format: C/U <read_id> <taxid> <length> <kmer_info>
2145-
parts = line.split('\t')
2133+
cctx = zstd.ZstdCompressor()
2134+
with open(output_file, 'wb') as raw_f:
2135+
with cctx.stream_writer(raw_f) as compressor:
2136+
compressor.write(b'SAMPLE_ID\tREAD_ID\tTAXONOMY_ID\tTAX_NAME\tKINGDOM\tTAX_RANK\n')
2137+
try:
2138+
for line in f:
2139+
# Skip empty lines
2140+
line = line.strip()
2141+
if not line:
2142+
continue
21462143
2147-
if len(parts) < 3:
2148-
print(f"Warning: Skipping malformed line: {line[:100]}", file=sys.stderr)
2149-
continue
2144+
# Parse Kraken2 output format
2145+
# Format: C/U <read_id> <taxid> <length> <kmer_info>
2146+
parts = line.split('\t')
21502147
2151-
classification = parts[0].strip() # C or U
2152-
read_id = parts[1].strip()
2153-
taxid_str = parts[2].strip()
2148+
if len(parts) < 3:
2149+
print(f"Warning: Skipping malformed line: {line[:100]}", file=sys.stderr)
2150+
continue
21542151
2155-
# Handle unclassified reads (taxid = 0)
2156-
try:
2157-
taxid = int(taxid_str)
2158-
except ValueError:
2159-
print(f"Warning: Invalid taxid '{taxid_str}' for read {read_id}", file=sys.stderr)
2160-
continue
2152+
classification = parts[0].strip() # C or U
2153+
read_id = parts[1].strip()
2154+
taxid_str = parts[2].strip()
21612155
2162-
if classification == 'U':
2163-
unclassified_count += 1
2164-
tax_name = 'Unclassified'
2165-
kingdom = 'Unclassified'
2166-
tax_rank = 'unclassified'
2167-
else:
2168-
classified_count += 1
2169-
tax_name = tax_db.get_name(taxid)
2170-
kingdom = tax_db.get_kingdom(taxid)
2171-
tax_rank = tax_db.get_rank(taxid, resolve_strains=resolve_strains)
2156+
# Handle unclassified reads (taxid = 0)
2157+
try:
2158+
taxid = int(taxid_str)
2159+
except ValueError:
2160+
print(f"Warning: Invalid taxid '{taxid_str}' for read {read_id}", file=sys.stderr)
2161+
continue
21722162
2173-
rows.append((sample_id, read_id, taxid, tax_name, kingdom, tax_rank))
2163+
if classification == 'U':
2164+
unclassified_count += 1
2165+
tax_name = 'Unclassified'
2166+
kingdom = 'Unclassified'
2167+
tax_rank = 'unclassified'
2168+
else:
2169+
classified_count += 1
2170+
tax_name = tax_db.get_name(taxid)
2171+
kingdom = tax_db.get_kingdom(taxid)
2172+
tax_rank = tax_db.get_rank(taxid, resolve_strains=resolve_strains)
21742173
2175-
finally:
2176-
f.close()
2174+
row = (sample_id, read_id, taxid, tax_name, kingdom, tax_rank)
2175+
compressor.write(('\t'.join(str(v) for v in row) + '\n').encode('utf-8'))
21772176
2178-
# Write output as TSV
2179-
_write_tsv(rows, output_file)
2177+
finally:
2178+
f.close()
21802179
21812180
print(f"\nProcessing complete:", file=sys.stderr)
21822181
print(f" Classified reads: {classified_count}", file=sys.stderr)
21832182
print(f" Unclassified reads: {unclassified_count}", file=sys.stderr)
21842183
print(f" Total reads: {classified_count + unclassified_count}", file=sys.stderr)
21852184
21862185
2187-
def _write_tsv(rows, output_file):
2188-
"""Write rows as zstd-compressed TSV."""
2189-
cctx = zstd.ZstdCompressor()
2190-
with open(output_file, 'wb') as raw_f:
2191-
with cctx.stream_writer(raw_f) as compressor:
2192-
compressor.write(b'SAMPLE_ID\tREAD_ID\tTAXONOMY_ID\tTAX_NAME\tKINGDOM\tTAX_RANK\n')
2193-
for row in rows:
2194-
compressor.write(('\t'.join(str(v) for v in row) + '\n').encode('utf-8'))
2195-
2196-
21972186
tax_db = DuckDBTaxonomyDatabase("~{taxonomy_db}", resolve_strains=~{true="True" false="False" resolve_strains})
21982187
parse_kraken2_output("~{kraken2_reads_output}", tax_db, "~{out_compressed}", "~{sample_id}")
21992188
CODE

0 commit comments

Comments
 (0)