Merge pull request #766 from broadinstitute/kl/row_len

klaricch · web-flow · commit df63d57fe6d2 · 2025-03-21T15:12:27.000-05:00
Edits to check_global_and_row_annot_lengths for efficiency
diff --git a/gnomad/assessment/validity_checks.py b/gnomad/assessment/validity_checks.py
@@ -1117,35 +1117,49 @@ def check_global_and_row_annot_lengths(
     t = t.rows() if isinstance(t, hl.MatrixTable) else t
     if not check_all_rows:
         t = t.head(1)
+
+    n_rows = t.count()
+
+    global_lengths = {
+        global_field: hl.eval(hl.len(t.index_globals()[global_field]))
+        for row_field, global_fields in row_to_globals_check.items()
+        for global_field in global_fields
+    }
+
+    row_length_counts = {
+        row_field: t.aggregate(hl.agg.counter(hl.len(t[row_field])))
+        for row_field in row_to_globals_check.keys()
+    }
+
     for row_field, global_fields in row_to_globals_check.items():
         if not check_all_rows:
             logger.info(
                 "Checking length of %s in first row against length of globals: %s",
                 row_field,
                 global_fields,
             )
+
+        row_lengths = row_length_counts[row_field]
+
         for global_field in global_fields:
-            global_len = hl.eval(hl.len(t[global_field]))
-            row_len_expr = hl.len(t[row_field])
-            failed_rows = t.aggregate(
-                hl.struct(
-                    n_fail=hl.agg.count_where(row_len_expr != global_len),
-                    row_len=hl.agg.counter(row_len_expr),
-                )
+            global_len = global_lengths[global_field]
+            failed_rows = sum(
+                count for length, count in row_lengths.items() if length != global_len
             )
-            outcome = "Failed" if failed_rows["n_fail"] > 0 else "Passed"
-            n_rows = t.count()
+
+            outcome = "Failed" if failed_rows > 0 else "Passed"
+
             logger.info(
                 "%s global and row lengths comparison: Length of %s in"
-                " globals (%d) does %smatch length of %s in %d out of %d rows (%s)",
+                " globals (%d) does %smatch length of %s in %d out of %d rows (row length counter: %s)",
                 outcome,
                 global_field,
                 global_len,
                 "NOT " if outcome == "Failed" else "",
                 row_field,
-                failed_rows["n_fail"] if outcome == "Failed" else n_rows,
+                failed_rows if outcome == "Failed" else n_rows,
                 n_rows,
-                failed_rows["row_len"],
+                row_lengths,
             )
 
 
diff --git a/tests/assessment/test_validity_checks.py b/tests/assessment/test_validity_checks.py
@@ -7,6 +7,7 @@
 import pytest
 
 from gnomad.assessment.validity_checks import (
+    check_global_and_row_annot_lengths,
     check_missingness_of_struct,
     check_raw_and_adj_callstats,
     check_sex_chr_metrics,
@@ -502,6 +503,58 @@ def test_sum_group_callstats(ht_for_group_sums, caplog) -> None:
         ), f"Expected phrase missing: {log_phrase}"
 
 
+@pytest.fixture
+def ht_for_check_global_and_row_annot_lengths() -> hl.Table:
+    """Fixture for check_global_and_row_annot_lengths: freq has length 3 in both rows (matching its globals); faf has length 3 in the second row (globals have length 2)."""
+    ht = hl.Table.parallelize(
+        [
+            {"freq": [0.1, 0.2, 0.3], "faf": [0.01, 0.02]},
+            {"freq": [0.8, 0.4, 0.5], "faf": [0.03, 0.04, 0.05]},
+        ],
+        hl.tstruct(freq=hl.tarray(hl.tfloat64), faf=hl.tarray(hl.tfloat64)),
+    )
+
+    return ht.annotate_globals(
+        freq_meta=["A", "B", "C"],
+        freq_index_dict={"A": 0, "B": 1, "C": 2},
+        freq_meta_sample_count=[100, 200, 300],
+        faf_meta=["D", "E"],
+        faf_index_dict={"D": 0, "E": 1},
+    )
+
+
+def test_check_global_and_row_annot_lengths(
+    ht_for_check_global_and_row_annot_lengths, caplog
+) -> None:
+    """Test that check_global_and_row_annot_lengths logs "Passed" for every freq global and "Failed" for every faf global."""
+    ht = ht_for_check_global_and_row_annot_lengths
+
+    # Define the row_to_globals_check dictionary.
+    row_to_globals_check = {
+        "freq": ["freq_meta", "freq_index_dict", "freq_meta_sample_count"],
+        "faf": ["faf_meta", "faf_index_dict"],
+    }
+
+    with caplog.at_level(logging.INFO, logger="gnomad.assessment.validity_checks"):
+        check_global_and_row_annot_lengths(
+            ht, row_to_globals_check, check_all_rows=True
+        )
+
+    log_messages = [record.message for record in caplog.records]
+
+    # Verify log messages.
+    expected_logs = [
+        "Passed global and row lengths comparison: Length of freq_meta in globals (3) does match length of freq in 2 out of 2 rows (row length counter: {3: 2})",
+        "Passed global and row lengths comparison: Length of freq_index_dict in globals (3) does match length of freq in 2 out of 2 rows (row length counter: {3: 2})",
+        "Passed global and row lengths comparison: Length of freq_meta_sample_count in globals (3) does match length of freq in 2 out of 2 rows (row length counter: {3: 2})",
+        "Failed global and row lengths comparison: Length of faf_meta in globals (2) does NOT match length of faf in 1 out of 2 rows (row length counter: {2: 1, 3: 1})",
+        "Failed global and row lengths comparison: Length of faf_index_dict in globals (2) does NOT match length of faf in 1 out of 2 rows (row length counter: {2: 1, 3: 1})",
+    ]
+
+    for msg in expected_logs:
+        assert msg in log_messages, f"Expected log message is missing: {msg}"
+
+
 @pytest.fixture
 def ht_for_check_raw_and_adj_callstats() -> hl.Table:
     """Fixture to create a Hail Table with the expected structure and test values for check_raw_and_adj_callstats, using underscore as the delimiter."""