Commit 88b2dd9

Merge pull request #1040 from broadinstitute/fix/1037-unique-library-ids

Fix splitcode lookup when library_id_per_sample is unique per sample

2 parents 15940c1 + 54647d4

File tree

6 files changed: +194 additions, -34 deletions

.github/copilot-instructions.md

Lines changed: 31 additions & 0 deletions
````diff
@@ -0,0 +1,31 @@
+# Copilot Instructions
+
+This file provides guidance to GitHub Copilot when working with code in this repository.
+
+**IMPORTANT**: Always read [AGENTS.md](../AGENTS.md) at the start of every session before doing any work. It contains comprehensive project context and development guidelines that are essential for working in this codebase.
+
+## Quick Reference
+
+- **Docker-centric development**: Run tests inside containers, not on host
+- **Import pattern**: `from viral_ngs import core` then `core.samtools.SamtoolsTool()`
+- **Test location**: `tests/unit/<module>/`
+- **Dependencies**: ALL via conda, not pip (see `docker/requirements/*.txt`)
+
+## Running Tests
+
+```bash
+docker run --rm \
+  -v $(pwd):/opt/viral-ngs/source \
+  quay.io/broadinstitute/viral-ngs:main-core \
+  pytest -rsxX -n auto /opt/viral-ngs/source/tests/unit
+```
+
+## Key Files
+
+| File | Purpose |
+|------|---------|
+| [AGENTS.md](../AGENTS.md) | Full AI assistant guidance |
+| [pyproject.toml](../pyproject.toml) | Package configuration |
+| [docker/](../docker/) | Dockerfiles and requirements |
+| [src/viral_ngs/](../src/viral_ngs/) | Source code |
+| [tests/](../tests/) | Test files |
````

AGENTS.md

Lines changed: 15 additions & 0 deletions
````diff
@@ -69,6 +69,21 @@ docker run --rm \
   pytest -rsxX -n auto /opt/viral-ngs/source/tests/unit/classify
 ```

+**Important: Testing source code changes requires re-installing the package.**
+The `-v` mount makes your local files visible on disk, but `viral_ngs` is already installed as a package inside the container image. Python imports resolve to the *installed* copy, not your mounted source files. If you've modified files under `src/viral_ngs/`, you must re-install before running tests:
+
+```bash
+# Run tests with local source changes applied
+docker run --rm \
+  -v $(pwd):/opt/viral-ngs/source \
+  quay.io/broadinstitute/viral-ngs:main-core \
+  bash -c "pip install -e /opt/viral-ngs/source --quiet && pytest -rsxX -n auto /opt/viral-ngs/source/tests/unit"
+```
+
+Changes to test files (`tests/`) and test inputs (`tests/input/`) are picked up automatically via the volume mount — the re-install is only needed when modifying the `src/viral_ngs/` package code.
+
+Running pytest directly on the host will generally not work — most dependencies (bioinformatics tools, conda packages) are only available inside the Docker containers. Always test inside Docker.
+
 **Test conventions:**
 - Uses pytest (not nose or unittest)
 - Test files in `tests/unit/<module>/`
````
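The import-resolution behavior described above can be sanity-checked from inside the container. The following is a minimal standalone sketch (not part of this repository; `resolves_from` is a hypothetical helper) using only the standard library to see which on-disk copy of a module Python would actually import:

```python
import importlib.util

def resolves_from(module_name: str, source_root: str) -> bool:
    """Return True if `module_name` would be imported from under `source_root`.

    Useful for checking whether Python will pick up the mounted source tree
    or the copy baked into the container image.
    """
    spec = importlib.util.find_spec(module_name)
    if spec is None or spec.origin is None:
        return False
    return spec.origin.startswith(source_root)

# A stdlib module never lives under a project source mount:
print(resolves_from("json", "/opt/viral-ngs/source"))  # False
```

After `pip install -e /opt/viral-ngs/source`, the editable install points the package's import location at the mounted tree, so a check like this (on the package in question) would flip to True.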

src/viral_ngs/core/splitcode.py

Lines changed: 45 additions & 32 deletions
```diff
@@ -344,21 +344,45 @@ def duplication_check(df, primary_cols, secondary_col, error_message_header=None
     pool_dfs = []
     unmatched_dfs = []

-    for pool in barcodes_df["muxed_pool"].unique():
-        # Get and load splitcode stats report json
-        # Use the full pool name (including run suffix) to match the JSON filename created by splitcode
-        pool_for_file_lookup = pool
-
-        # Try to find and load the splitcode summary JSON file
-        # Add robust error handling since missing/misplaced JSON files are a common issue
+    # Build barcode_group column for JSON lookup: outer barcodes only, no library_id.
+    # Splitcode produces ONE summary JSON per outer barcode group (barcode_1+barcode_2),
+    # but muxed_pool includes library_id_per_sample. When samples have unique library_ids,
+    # there are multiple muxed_pool values per barcode group but only one JSON file.
+    def _barcode_group(row):
+        b1 = row.get("barcode_1", "")
+        b2 = row.get("barcode_2", "")
+        if b2 and str(b2).strip():
+            return f"{b1}-{b2}"
+        return b1
+    barcodes_df["barcode_group"] = barcodes_df.apply(_barcode_group, axis=1)
+
+    for barcode_group in barcodes_df["barcode_group"].unique():
+        samplesheet_rows_for_pool_df = barcodes_df[barcodes_df["barcode_group"] == barcode_group]
+
+        # Find the splitcode summary JSON file for this barcode group.
+        # Try each muxed_pool value in the group until we find a matching JSON.
+        # This handles the case where library_id_per_sample differs per sample:
+        # splitcode produces one JSON named after whichever muxed_pool was used as pool_id.
+        splitcode_summary_file = None
+        tried_patterns = []
         try:
-            summary_pattern = f"{outDir}/{pool_for_file_lookup}_summary.json"
-            matching_files = glob.glob(summary_pattern)
-
-            if not matching_files:
+            for candidate_pool in samplesheet_rows_for_pool_df["muxed_pool"].unique():
+                summary_pattern = f"{outDir}/{candidate_pool}_summary.json"
+                tried_patterns.append(summary_pattern)
+                matching_files = glob.glob(summary_pattern)
+                if matching_files:
+                    splitcode_summary_file = matching_files[0]
+                    if len(matching_files) > 1:
+                        log.warning(f"Multiple summary JSON files match pattern '{summary_pattern}':")
+                        for f in matching_files:
+                            log.warning(f"  - {f}")
+                        log.warning(f"Using first match: {splitcode_summary_file}")
+                    break
+
+            if splitcode_summary_file is None:
                 # JSON file not found - list directory contents for debugging
-                log.error(f"Splitcode summary JSON not found for pool '{pool_for_file_lookup}'")
-                log.error(f"  Expected pattern: {summary_pattern}")
+                log.error(f"Splitcode summary JSON not found for barcode group '{barcode_group}'")
+                log.error(f"  Tried patterns: {tried_patterns}")
                 log.error(f"  Searching in directory: {outDir}")

                 # List all files in the output directory to help debug
@@ -384,20 +408,11 @@ def duplication_check(df, primary_cols, secondary_col, error_message_header=None
                     log.error(f"  Could not list directory contents: {list_err}")

                 raise FileNotFoundError(
-                    f"Splitcode summary JSON not found for pool '{pool_for_file_lookup}'. "
-                    f"Expected file: {summary_pattern}. "
+                    f"Splitcode summary JSON not found for barcode group '{barcode_group}'. "
+                    f"Tried patterns: {tried_patterns}. "
                     f"Check logs above for directory contents."
                 )

-            splitcode_summary_file = matching_files[0]
-
-            # Warn if multiple matches found (shouldn't happen but good to catch)
-            if len(matching_files) > 1:
-                log.warning(f"Multiple summary JSON files match pattern '{summary_pattern}':")
-                for f in matching_files:
-                    log.warning(f"  - {f}")
-                log.warning(f"Using first match: {splitcode_summary_file}")
-
             log.debug(f"Loading splitcode summary from: {splitcode_summary_file}")

             with open(splitcode_summary_file, "r") as f:
@@ -422,14 +437,12 @@ def duplication_check(df, primary_cols, secondary_col, error_message_header=None
                     log.error(f"  Could not read file for debugging: {read_err}")
                 raise
             except Exception as e:
-                log.error(f"Unexpected error loading splitcode summary for pool '{pool_for_file_lookup}'")
-                log.error(f"  File: {splitcode_summary_file if 'splitcode_summary_file' in locals() else 'not determined'}")
+                log.error(f"Unexpected error loading splitcode summary for barcode group '{barcode_group}'")
+                log.error(f"  File: {splitcode_summary_file if splitcode_summary_file else 'not determined'}")
                 log.error(f"  Error type: {type(e).__name__}")
                 log.error(f"  Error message: {e}")
                 raise

-        samplesheet_rows_for_pool_df = barcodes_df[barcodes_df["muxed_pool"] == pool]
-
         # Parse splitcode summary JSON
         # IMPORTANT: The tag_qc array has MULTIPLE entries per barcode tag!
         # Each barcode appears once for each hamming distance level (0, 1, 2, 3).
@@ -475,18 +488,18 @@ def duplication_check(df, primary_cols, secondary_col, error_message_header=None
         else:
             # No reads were processed by splitcode for this pool
             # Create a dataframe with the expected schema but all counts set to 0
-            log.warning(f"Pool {pool} has 0 reads processed by splitcode. Creating empty metrics.")
+            log.warning(f"Barcode group {barcode_group} has 0 reads processed by splitcode. Creating empty metrics.")
             samplesheet_rows_for_pool_hx_df = samplesheet_rows_for_pool_df.copy()
             samplesheet_rows_for_pool_hx_df['count'] = 0
             samplesheet_rows_for_pool_hx_df['count_h1'] = 0

         pool_dfs.append(samplesheet_rows_for_pool_hx_df)

         unmatched_dict = {
-            "sample" : f"{unmatched_name}.{pool}",
+            "sample" : f"{unmatched_name}.{barcode_group}",
             "library_id_per_sample" : list(set(samplesheet_rows_for_pool_hx_df["library_id_per_sample"]))[0],
-            "run" : f"{unmatched_name}.{pool}",
-            "muxed_pool" : pool,
+            "run" : f"{unmatched_name}.{barcode_group}",
+            "muxed_pool" : barcode_group,
             "count" : splitcode_summary["n_processed"] - splitcode_summary["n_assigned"],
             "count_h1" : 0,
             "barcode_1" : list(samplesheet_rows_for_pool_hx_df["barcode_1"])[0],
```
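The core of the fix above is a fallback lookup: splitcode writes a single summary JSON per outer barcode group, named after whichever muxed_pool value it was invoked with, so the code must probe each candidate name until one exists on disk. A simplified standalone sketch of that loop (no logging or multiple-match handling; `find_summary_json` is illustrative, not the repository's function):

```python
import glob
import os
import tempfile

def find_summary_json(out_dir, candidate_pools):
    """Probe '<pool>_summary.json' for each candidate muxed_pool name and
    return the first file that exists, raising with the list of tried
    patterns if none match (mirroring the fallback loop in the diff above)."""
    tried = []
    for pool in candidate_pools:
        pattern = os.path.join(out_dir, f"{pool}_summary.json")
        tried.append(pattern)
        matches = glob.glob(pattern)
        if matches:
            return matches[0]
    raise FileNotFoundError(f"No summary JSON found. Tried: {tried}")

with tempfile.TemporaryDirectory() as d:
    # Three samples share outer barcodes but carry unique library_ids, so
    # there are three muxed_pool values -- yet splitcode wrote only ONE JSON:
    open(os.path.join(d, "ATCGATCG-GCTAGCTA.lL1_summary.json"), "w").close()
    pools = ["ATCGATCG-GCTAGCTA.lL1", "ATCGATCG-GCTAGCTA.lL2", "ATCGATCG-GCTAGCTA.lL3"]
    print(os.path.basename(find_summary_json(d, pools)))
    # -> ATCGATCG-GCTAGCTA.lL1_summary.json
```

The pre-fix code failed here because it looked up exactly one pattern per muxed_pool value and raised as soon as `.lL2` or `.lL3` had no file of its own.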
Lines changed: 36 additions & 0 deletions
```diff
@@ -0,0 +1,36 @@
+{
+  "n_processed": 100000,
+  "n_assigned": 95000,
+  "tag_qc": [
+    {
+      "tag": "Sample1.lL1_R1",
+      "distance": 0,
+      "count": 30000
+    },
+    {
+      "tag": "Sample1.lL1_R1",
+      "distance": 1,
+      "count": 1500
+    },
+    {
+      "tag": "Sample2.lL2_R1",
+      "distance": 0,
+      "count": 45000
+    },
+    {
+      "tag": "Sample2.lL2_R1",
+      "distance": 1,
+      "count": 2000
+    },
+    {
+      "tag": "Sample3.lL3_R1",
+      "distance": 0,
+      "count": 15000
+    },
+    {
+      "tag": "Sample3.lL3_R1",
+      "distance": 1,
+      "count": 1500
+    }
+  ]
+}
```
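As the IMPORTANT comment in splitcode.py notes, `tag_qc` carries one entry per (tag, hamming distance) pair, so per-sample totals require aggregation across distance levels. A minimal sketch over an abbreviated copy of the fixture above (field names taken from the JSON shown; this is illustrative, not the repository's parsing code):

```python
import json
from collections import defaultdict

summary = json.loads("""
{
  "n_processed": 100000,
  "n_assigned": 95000,
  "tag_qc": [
    {"tag": "Sample2.lL2_R1", "distance": 0, "count": 45000},
    {"tag": "Sample2.lL2_R1", "distance": 1, "count": 2000}
  ]
}
""")

# Fold the per-(tag, distance) entries into one map per tag
counts = defaultdict(dict)
for entry in summary["tag_qc"]:
    counts[entry["tag"]][entry["distance"]] = entry["count"]

sample2 = counts["Sample2.lL2_R1"]
total = sum(sample2.values())
unmatched = summary["n_processed"] - summary["n_assigned"]
print(total, unmatched)  # 47000 5000
```

These are exactly the numbers the new unit test asserts for Sample2 (`num_reads_total` of 47000) and the unmatched row (100000 − 95000 = 5000).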
Lines changed: 4 additions & 0 deletions
```diff
@@ -0,0 +1,4 @@
+sample	barcode_1	barcode_2	barcode_3	library_id_per_sample
+Sample1	ATCGATCG	GCTAGCTA	AAAACCCC	L1
+Sample2	ATCGATCG	GCTAGCTA	GGGGTTTT	L2
+Sample3	ATCGATCG	GCTAGCTA	CCCCGGGG	L3
```
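Applied to this sample sheet, the new `_barcode_group` logic collapses three distinct muxed_pool values into one lookup key. A pandas-free sketch (the `<group>.l<library_id>` muxed_pool format is inferred from the fixture filenames in the tests, not confirmed by the diff):

```python
import csv
import io

TSV = """sample\tbarcode_1\tbarcode_2\tbarcode_3\tlibrary_id_per_sample
Sample1\tATCGATCG\tGCTAGCTA\tAAAACCCC\tL1
Sample2\tATCGATCG\tGCTAGCTA\tGGGGTTTT\tL2
Sample3\tATCGATCG\tGCTAGCTA\tCCCCGGGG\tL3
"""

def barcode_group(row):
    # Outer barcodes only; library_id is deliberately excluded (the fix)
    b1, b2 = row["barcode_1"], row.get("barcode_2", "")
    return f"{b1}-{b2}" if b2 and b2.strip() else b1

rows = list(csv.DictReader(io.StringIO(TSV), delimiter="\t"))
groups = {barcode_group(r) for r in rows}
muxed_pools = {f"{barcode_group(r)}.l{r['library_id_per_sample']}" for r in rows}

print(sorted(groups))   # ['ATCGATCG-GCTAGCTA']
print(len(muxed_pools)) # 3 distinct pools, but only 1 summary JSON on disk
```

One barcode group, three muxed_pool values: iterating over `barcode_group` instead of `muxed_pool` is what lets all three samples share the single JSON.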

tests/unit/core/test_tools_splitcode.py

Lines changed: 63 additions & 2 deletions
```diff
@@ -146,7 +146,7 @@ def test_multi_pool(self):
         # Should have 4 samples + 2 unmatched (one per pool)
         self.assertEqual(len(df), 6)

-        # Verify both pools are present
+        # Verify both barcode groups are present in muxed_pool
         pools = set(df['muxed_pool'].tolist())
         self.assertIn('AAAAAAAA-TTTTTTTT.lLibA', pools)
         self.assertIn('GGGGGGGG-CCCCCCCC.lLibB', pools)
@@ -156,7 +156,8 @@ def test_multi_pool(self):
         self.assertEqual(len(unmatched_rows), 2)

         # Verify LibA unmatched count: 50000 - 48000 = 2000
-        liba_unmatched = df[df['muxed_pool'] == 'AAAAAAAA-TTTTTTTT.lLibA']
+        # Unmatched rows use the barcode group (outer barcodes only) as muxed_pool
+        liba_unmatched = df[df['muxed_pool'] == 'AAAAAAAA-TTTTTTTT']
         liba_unmatched = liba_unmatched[liba_unmatched['sample'].str.contains('Unmatched')].iloc[0]
         self.assertEqual(int(liba_unmatched['num_reads_hdistance0']), 2000)

@@ -190,6 +191,66 @@ def test_append_run_id(self):
         self.assertIn('FLOWCELL123', sample_row['run'])
         self.assertIn('FLOWCELL123', sample_row['muxed_pool'])

+    def test_unique_library_ids_per_sample(self):
+        """Test that lookup works when library_id_per_sample differs per sample
+        within the same barcode_1+barcode_2 group (issue #1037).
+
+        When each sample has a unique library_id_per_sample, each gets its own
+        muxed_pool value, but splitcode produces only ONE summary JSON per outer
+        barcode group. The function must find and reuse that single JSON for all
+        samples in the group.
+        """
+        inDir = viral_ngs.core.file.get_test_input_path(self)
+        sample_sheet = os.path.join(inDir, 'sample_sheet_unique_lib_ids.tsv')
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            csv_out = os.path.join(tmpdir, 'lut.csv')
+
+            # Copy the ONE JSON that exists for this barcode pair
+            # (named with the first muxed_pool value, as splitcode would produce)
+            shutil.copy(
+                os.path.join(inDir, 'ATCGATCG-GCTAGCTA.lL1_summary.json'),
+                os.path.join(tmpdir, 'ATCGATCG-GCTAGCTA.lL1_summary.json')
+            )
+
+            result_path = viral_ngs.core.splitcode.create_splitcode_lookup_table(
+                sample_sheet, csv_out, unmatched_name="Unmatched"
+            )
+
+            # Verify output file was created
+            self.assertTrue(os.path.exists(result_path))
+
+            # Read and validate output CSV
+            import pandas as pd
+            df = pd.read_csv(result_path, dtype=str)
+
+            # Check number of rows (3 samples + 1 unmatched)
+            self.assertEqual(len(df), 4)
+
+            # Verify all sample names present
+            sample_names = set(df['sample'].tolist())
+            self.assertIn('Sample1', sample_names)
+            self.assertIn('Sample2', sample_names)
+            self.assertIn('Sample3', sample_names)
+
+            # Verify unmatched row
+            unmatched_rows = df[df['sample'].str.contains('Unmatched')]
+            self.assertEqual(len(unmatched_rows), 1)
+            unmatched_row = unmatched_rows.iloc[0]
+            # Unmatched count should be n_processed - n_assigned = 100000 - 95000 = 5000
+            self.assertEqual(int(unmatched_row['num_reads_hdistance0']), 5000)
+
+            # Verify read counts for Sample2 (should have highest count)
+            sample2_row = df[df['sample'] == 'Sample2'].iloc[0]
+            self.assertEqual(int(sample2_row['num_reads_hdistance0']), 45000)
+            self.assertEqual(int(sample2_row['num_reads_hdistance1']), 2000)
+            self.assertEqual(int(sample2_row['num_reads_total']), 47000)
+
+            # Verify barcode values are correct
+            self.assertEqual(sample2_row['barcode_1'], 'ATCGATCG')
+            self.assertEqual(sample2_row['barcode_2'], 'GCTAGCTA')
+            self.assertEqual(sample2_row['inline_barcode'], 'GGGGTTTT')
+

 class TestSplitcodeIntegration(TestCaseWithTmp):
     """
```
