fix(stats): penalize divided groups in group-aware consensus

nicobao · nicobao · commit 6f59da64bc0c · 2026-03-04T21:21:46.000+01:00
The group-aware consensus score used raw p_agree per group in the
geometric mean, completely ignoring p_disagree. This meant a group
genuinely divided (similar levels of agree and disagree) contributed
the same score as an undivided group with the same agree level,
allowing divided groups to be masked by other groups' strong agreement.

Replace raw p_agree with "effective agreement": p_agree * (1 - p_disagree).
This discounts each group's agreement by its disagreement, so a divided
group naturally drags down the consensus score while still producing a
continuous ranking. The disagree consensus score is treated symmetrically,
using p_disagree * (1 - p_agree).

Also document all divergences from the original Polis algorithm:
- Geometric mean normalization (existing)
- Effective agreement (new)
- Progressive confidence lowering for representative statements (existing)
- Significance-based repful_for determination (existing)
diff --git a/reddwarf/utils/stats.py b/reddwarf/utils/stats.py
@@ -100,6 +100,11 @@ def is_statement_significant(row: pd.Series, confidence=0.90) -> bool:
     return is_agreement_significant or is_disagreement_significant
 
 
+# DIVERGENCE FROM POLIS: Polis determines repful_for using the higher of
+# ra (agree representativeness ratio) vs rd (disagree representativeness ratio).
+# We instead use significance tests (rat/rdt z-scores) with a confidence
+# threshold, falling back to the raw z-score comparison only when neither
+# direction passes the significance test.
 def get_statement_repful_for(
     row: pd.Series, confidence=0.90
 ) -> Literal["agree", "disagree"]:
@@ -338,14 +343,29 @@ def calculate_comment_statistics(
         )  # rdt
 
     # Calculate group-aware consensus
-    # Geometric mean: normalize for group count so that similar levels of
-    # cross-group consensus produce similar scores regardless of whether
-    # the conversation has 2 or 6 opinion groups. This helps when applying
-    # a selection algorithm with a fixed threshold (e.g. 0.5).
-    # Reference: https://github.com/compdemocracy/polis/blob/edge/math/src/polismath/math/conversation.clj#L615-L636
+    # Reference (original Polis): https://github.com/compdemocracy/polis/blob/edge/math/src/polismath/math/conversation.clj#L615-L636
+    #
+    # DIVERGENCE #1 FROM POLIS (geometric mean):
+    # Polis uses a raw product of per-group probabilities. That product
+    # shrinks exponentially as groups are added, so a fixed selection
+    # threshold (e.g. 0.5) becomes practically unreachable with 4-6 opinion
+    # groups. We use a geometric mean (prod^(1/n_groups)) so that similar
+    # levels of cross-group consensus score similarly regardless of group count.
+    #
+    # DIVERGENCE #2 FROM POLIS (effective agreement):
+    # Polis uses raw p_agree per group, ignoring p_disagree entirely. A
+    # genuinely divided group (similar levels of agree and disagree) then
+    # contributes the same score as an undivided group with the same agree
+    # level, so it can be masked by other groups' strong agreement. We
+    # instead use "effective agreement": p_agree * (1 - p_disagree), which
+    # discounts each group's agreement by its disagreement so a divided
+    # group drags down the consensus score. The disagree score is computed
+    # symmetrically as p_disagree * (1 - p_agree).
     n_groups = P_v_g_c.shape[1]
-    C_v_c[votes.A, :] = P_v_g_c[votes.A, :, :].prod(axis=0) ** (1.0 / n_groups)
-    C_v_c[votes.D, :] = P_v_g_c[votes.D, :, :].prod(axis=0) ** (1.0 / n_groups)
+    effective_agree = P_v_g_c[votes.A, :, :] * (1 - P_v_g_c[votes.D, :, :])
+    effective_disagree = P_v_g_c[votes.D, :, :] * (1 - P_v_g_c[votes.A, :, :])
+    C_v_c[votes.A, :] = effective_agree.prod(axis=0) ** (1.0 / n_groups)
+    C_v_c[votes.D, :] = effective_disagree.prod(axis=0) ** (1.0 / n_groups)
 
     return (
         N_g_c,  # ns
@@ -581,6 +601,13 @@ def priority_metric(
 # Figuring out select-rep-comments flow
 # See: https://github.com/compdemocracy/polis/blob/7bf9eccc287586e51d96fdf519ae6da98e0f4a70/math/src/polismath/math/repness.clj#L209C7-L209C26
 # TODO: omg please clean this up.
+#
+# DIVERGENCE FROM POLIS: Polis uses a fixed confidence level and may return
+# fewer than pick_max statements when not enough pass the significance test.
+# We progressively lower the confidence from the initial value down to 0.60
+# (in 0.05 steps) to fill up to pick_max representative statements. If no
+# statements pass even the lowest confidence, we fall back to the single
+# best statement by repness z-score.
 def select_representative_statements(
     grouped_stats_df: pd.DataFrame,
     mod_out_statement_ids: list[int] = [],
@@ -590,8 +617,6 @@ def select_representative_statements(
     """
     Selects statistically representative statements from each group cluster.
 
-    This is expected to match the Polis outputs when all defaults are set.
-
     Args:
         grouped_stats_df (pd.DataFrame): MultiIndex Dataframe of statement statistics, indexed by group and statement.
         mod_out_statement_ids (list[int]): A list of statements to ignore from selection algorithm
diff --git a/tests/utils/test_stats.py b/tests/utils/test_stats.py
@@ -311,6 +311,12 @@ def test_priority_metric_array():
 
 @pytest.mark.parametrize("polis_convo_data", ["small-no-meta", "small-with-meta", "medium-no-meta", "medium-with-meta"], indirect=True)
 def test_group_aware_consensus_real_data(polis_convo_data):
+    """
+    Verify group-aware consensus on real Polis data.
+
+    Tests both the Polis-compatible baseline (geometric mean of raw p_agree)
+    and our effective agreement divergence (p_agree * (1 - p_disagree)).
+    """
     fixture = polis_convo_data
     loader = Loader(filepaths=[
         f'{fixture.data_dir}/votes.json',
@@ -333,20 +339,34 @@ def test_group_aware_consensus_real_data(polis_convo_data):
     all_clustered_participant_ids, cluster_labels = polismath.extract_data_from_polismath(fixture.math_data)
     vote_matrix = vote_matrix.loc[all_clustered_participant_ids, :]
 
-    # Generate stats all groups and all statements.
-    _, gac_df = stats.calculate_comment_statistics_dataframes(
+    # Generate stats for all groups and all statements.
+    # This returns both per-group probabilities (P_v_g_c) and
+    # the final group-aware consensus (C_v_c) which uses effective agreement.
+    N_g_c, N_v_g_c, P_v_g_c, _, P_v_g_c_test, _, C_v_c = stats.calculate_comment_statistics(
         vote_matrix=vote_matrix,
         cluster_labels=cluster_labels,
     )
 
-    calculated_gac = {
-        str(pid): float(row.iloc[0])
-        for pid, row in gac_df.iterrows()
-    }
-
     n_groups = len(set(cluster_labels))
-    expected_gac = helpers.polis_gac_to_geometric_mean(n_groups, fixture.math_data["group-aware-consensus"])
-    assert calculated_gac == pytest.approx(expected_gac)
+
+    # 1) Verify Polis baseline: geometric mean of raw p_agree still matches fixtures
+    polis_baseline_agree = P_v_g_c[stats.votes.A, :, :].prod(axis=0) ** (1.0 / n_groups)
+    calculated_baseline = {
+        str(sid): float(polis_baseline_agree[i])
+        for i, sid in enumerate(vote_matrix.columns)
+    }
+    expected_polis_gac = helpers.polis_gac_to_geometric_mean(n_groups, fixture.math_data["group-aware-consensus"])
+    assert calculated_baseline == pytest.approx(expected_polis_gac)
+
+    # 2) Verify effective agreement scores are in [0, 1] and <= Polis baseline
+    for i, sid in enumerate(vote_matrix.columns):
+        effective_score = float(C_v_c[stats.votes.A, i])
+        baseline_score = float(polis_baseline_agree[i])
+        assert 0 <= effective_score <= 1, f"Statement {sid}: score {effective_score} out of [0, 1]"
+        assert effective_score <= baseline_score + 1e-9, (
+            f"Statement {sid}: effective agreement {effective_score} should be "
+            f"<= Polis baseline {baseline_score}"
+        )
 
 
 def test_group_aware_consensus_uses_geometric_mean():
@@ -377,16 +397,59 @@ def test_group_aware_consensus_uses_geometric_mean():
     agree_score_2_groups = C_2[0, 0]  # votes.A = 0
     agree_score_3_groups = C_3[0, 0]
 
-    # With geometric mean, both should be close (same underlying consensus).
-    # Without it (raw product), 3 groups would give 0.512 vs 0.640 — much wider gap.
-    # Small difference remains due to Laplace smoothing on smaller groups.
-    assert agree_score_2_groups == pytest.approx(agree_score_3_groups, abs=0.06)
+    # With geometric mean, both should be reasonably close (same underlying consensus).
+    # Without it (raw product), 3 groups would give much lower scores.
+    # The effective agreement formula (p_agree * (1 - p_disagree)) amplifies
+    # the Laplace smoothing gap between group sizes, so the tolerance is wider
+    # than with raw p_agree alone.
+    assert agree_score_2_groups == pytest.approx(agree_score_3_groups, abs=0.1)
 
-    # Both should be well above 0.5 (all participants agree)
+    # Both should be above 0.5 (all participants agree)
     assert agree_score_2_groups > 0.5
     assert agree_score_3_groups > 0.5
 
 
+def test_group_aware_consensus_penalizes_divided_groups():
+    """
+    Verify that a group split roughly evenly between agree and disagree
+    drags down the consensus score compared to unanimous agreement.
+
+    This is the core behavior of the effective agreement divergence from Polis.
+    """
+    # 2 groups of 5 participants, 1 statement.
+    # Group 0: all agree. Group 1: split 3 agree / 2 disagree.
+    vote_matrix_divided = pd.DataFrame(
+        {0: [1, 1, 1, 1, 1, 1, 1, 1, -1, -1]},
+        index=list(range(10)),
+    )
+    cluster_labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    # Unanimous: both groups fully agree
+    vote_matrix_unanimous = pd.DataFrame(
+        {0: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
+        index=list(range(10)),
+    )
+
+    *_, C_divided = stats.calculate_comment_statistics(
+        vote_matrix=vote_matrix_divided,
+        cluster_labels=cluster_labels,
+    )
+    *_, C_unanimous = stats.calculate_comment_statistics(
+        vote_matrix=vote_matrix_unanimous,
+        cluster_labels=cluster_labels,
+    )
+
+    divided_score = C_divided[stats.votes.A, 0]
+    unanimous_score = C_unanimous[stats.votes.A, 0]
+
+    # Divided group should lower the consensus score relative to unanimity
+    assert divided_score < unanimous_score
+    # A group split 3/2 should produce a score below 0.5 (not genuine consensus)
+    assert divided_score < 0.5
+    # Unanimous agreement should be well above 0.5
+    assert unanimous_score > 0.5
+
+
 def test_format_comment_stats_repful_agree():
     statement = pd.Series({
         "statement_id": 1,