Skip to content

Commit 6f59da6

Browse files
committed
fix(stats): penalize divided groups in group-aware consensus
The group-aware consensus score used raw p_agree per group in the geometric mean, completely ignoring p_disagree. This meant a group genuinely divided (similar levels of agree and disagree) contributed the same score as an undivided group with the same agree level, allowing divided groups to be masked by other groups' strong agreement.

Replace raw p_agree with "effective agreement": p_agree * (1 - p_disagree). This discounts each group's agreement by its disagreement, so a divided group naturally drags down the consensus score while still producing a continuous ranking.

Also document all divergences from the original Polis algorithm:
- Geometric mean normalization (existing)
- Effective agreement (new)
- Progressive confidence lowering for representative statements (existing)
- Significance-based repful_for determination (existing)
1 parent e63c771 commit 6f59da6

2 files changed

Lines changed: 111 additions & 23 deletions

File tree

reddwarf/utils/stats.py

Lines changed: 34 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,11 @@ def is_statement_significant(row: pd.Series, confidence=0.90) -> bool:
100100
return is_agreement_significant or is_disagreement_significant
101101

102102

103+
# DIVERGENCE FROM POLIS: Polis determines repful_for using the higher of
104+
# ra (agree representativeness ratio) vs rd (disagree representativeness ratio).
105+
# We instead use significance tests (rat/rdt z-scores) with a confidence
106+
# threshold, falling back to the raw z-score comparison only when neither
107+
# direction passes the significance test.
103108
def get_statement_repful_for(
104109
row: pd.Series, confidence=0.90
105110
) -> Literal["agree", "disagree"]:
@@ -338,14 +343,29 @@ def calculate_comment_statistics(
338343
) # rdt
339344

340345
# Calculate group-aware consensus
341-
# Geometric mean: normalize for group count so that similar levels of
342-
# cross-group consensus produce similar scores regardless of whether
343-
# the conversation has 2 or 6 opinion groups. This helps when applying
344-
# a selection algorithm with a fixed threshold (e.g. 0.5).
345-
# Reference: https://github.com/compdemocracy/polis/blob/edge/math/src/polismath/math/conversation.clj#L615-L636
346+
# Reference (original Polis): https://github.com/compdemocracy/polis/blob/edge/math/src/polismath/math/conversation.clj#L615-L636
347+
#
348+
# DIVERGENCE #1 FROM POLIS (geometric mean):
349+
# Polis uses a raw product of per-group probabilities. This shrinks
350+
# exponentially with more groups, making a fixed selection threshold (e.g. 0.5) unreachable
351+
# for conversations with 4-6 opinion groups. We use a geometric mean
352+
# (prod^(1/n_groups)) so that similar levels of cross-group consensus
353+
# produce similar scores regardless of group count.
354+
#
355+
# DIVERGENCE #2 FROM POLIS (effective agreement):
356+
# Polis uses raw p_agree per group, ignoring p_disagree entirely. This
357+
# means a group that is genuinely divided (similar levels of agree and
358+
# disagree) contributes the same score as an undivided group with the
359+
# same agree level — allowing divided groups to be masked by other
360+
# groups' strong agreement. We fix this by using "effective agreement":
361+
# p_agree * (1 - p_disagree), which discounts each group's agreement
362+
# by its disagreement so a divided group naturally drags down the
363+
# consensus score.
346364
n_groups = P_v_g_c.shape[1]
347-
C_v_c[votes.A, :] = P_v_g_c[votes.A, :, :].prod(axis=0) ** (1.0 / n_groups)
348-
C_v_c[votes.D, :] = P_v_g_c[votes.D, :, :].prod(axis=0) ** (1.0 / n_groups)
365+
effective_agree = P_v_g_c[votes.A, :, :] * (1 - P_v_g_c[votes.D, :, :])
366+
effective_disagree = P_v_g_c[votes.D, :, :] * (1 - P_v_g_c[votes.A, :, :])
367+
C_v_c[votes.A, :] = effective_agree.prod(axis=0) ** (1.0 / n_groups)
368+
C_v_c[votes.D, :] = effective_disagree.prod(axis=0) ** (1.0 / n_groups)
349369

350370
return (
351371
N_g_c, # ns
@@ -581,6 +601,13 @@ def priority_metric(
581601
# Figuring out select-rep-comments flow
582602
# See: https://github.com/compdemocracy/polis/blob/7bf9eccc287586e51d96fdf519ae6da98e0f4a70/math/src/polismath/math/repness.clj#L209C7-L209C26
583603
# TODO: omg please clean this up.
604+
#
605+
# DIVERGENCE FROM POLIS: Polis uses a fixed confidence level and may return
606+
# fewer than pick_max statements when not enough pass the significance test.
607+
# We progressively lower the confidence from the initial value down to 0.60
608+
# (in 0.05 steps) to fill up to pick_max representative statements. If no
609+
# statements pass even the lowest confidence, we fall back to the single
610+
# best statement by repness z-score.
584611
def select_representative_statements(
585612
grouped_stats_df: pd.DataFrame,
586613
mod_out_statement_ids: list[int] = [],
@@ -590,8 +617,6 @@ def select_representative_statements(
590617
"""
591618
Selects statistically representative statements from each group cluster.
592619
593-
This is expected to match the Polis outputs when all defaults are set.
594-
595620
Args:
596621
grouped_stats_df (pd.DataFrame): MultiIndex Dataframe of statement statistics, indexed by group and statement.
597622
mod_out_statement_ids (list[int]): A list of statements to ignore from selection algorithm

tests/utils/test_stats.py

Lines changed: 77 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -311,6 +311,12 @@ def test_priority_metric_array():
311311

312312
@pytest.mark.parametrize("polis_convo_data", ["small-no-meta", "small-with-meta", "medium-no-meta", "medium-with-meta"], indirect=True)
313313
def test_group_aware_consensus_real_data(polis_convo_data):
314+
"""
315+
Verify group-aware consensus on real Polis data.
316+
317+
Tests both the Polis-compatible baseline (geometric mean of raw p_agree)
318+
and our effective agreement divergence (p_agree * (1 - p_disagree)).
319+
"""
314320
fixture = polis_convo_data
315321
loader = Loader(filepaths=[
316322
f'{fixture.data_dir}/votes.json',
@@ -333,20 +339,34 @@ def test_group_aware_consensus_real_data(polis_convo_data):
333339
all_clustered_participant_ids, cluster_labels = polismath.extract_data_from_polismath(fixture.math_data)
334340
vote_matrix = vote_matrix.loc[all_clustered_participant_ids, :]
335341

336-
# Generate stats all groups and all statements.
337-
_, gac_df = stats.calculate_comment_statistics_dataframes(
342+
# Generate stats for all groups and all statements.
343+
# This returns both per-group probabilities (P_v_g_c) and
344+
# the final group-aware consensus (C_v_c) which uses effective agreement.
345+
N_g_c, N_v_g_c, P_v_g_c, _, P_v_g_c_test, _, C_v_c = stats.calculate_comment_statistics(
338346
vote_matrix=vote_matrix,
339347
cluster_labels=cluster_labels,
340348
)
341349

342-
calculated_gac = {
343-
str(pid): float(row.iloc[0])
344-
for pid, row in gac_df.iterrows()
345-
}
346-
347350
n_groups = len(set(cluster_labels))
348-
expected_gac = helpers.polis_gac_to_geometric_mean(n_groups, fixture.math_data["group-aware-consensus"])
349-
assert calculated_gac == pytest.approx(expected_gac)
351+
352+
# 1) Verify Polis baseline: geometric mean of raw p_agree still matches fixtures
353+
polis_baseline_agree = P_v_g_c[stats.votes.A, :, :].prod(axis=0) ** (1.0 / n_groups)
354+
calculated_baseline = {
355+
str(sid): float(polis_baseline_agree[i])
356+
for i, sid in enumerate(vote_matrix.columns)
357+
}
358+
expected_polis_gac = helpers.polis_gac_to_geometric_mean(n_groups, fixture.math_data["group-aware-consensus"])
359+
assert calculated_baseline == pytest.approx(expected_polis_gac)
360+
361+
# 2) Verify effective agreement scores are in [0, 1] and <= Polis baseline
362+
for i, sid in enumerate(vote_matrix.columns):
363+
effective_score = float(C_v_c[stats.votes.A, i])
364+
baseline_score = float(polis_baseline_agree[i])
365+
assert 0 <= effective_score <= 1, f"Statement {sid}: score {effective_score} out of [0, 1]"
366+
assert effective_score <= baseline_score + 1e-9, (
367+
f"Statement {sid}: effective agreement {effective_score} should be "
368+
f"<= Polis baseline {baseline_score}"
369+
)
350370

351371

352372
def test_group_aware_consensus_uses_geometric_mean():
@@ -377,16 +397,59 @@ def test_group_aware_consensus_uses_geometric_mean():
377397
agree_score_2_groups = C_2[0, 0] # votes.A = 0
378398
agree_score_3_groups = C_3[0, 0]
379399

380-
# With geometric mean, both should be close (same underlying consensus).
381-
# Without it (raw product), 3 groups would give 0.512 vs 0.640 — much wider gap.
382-
# Small difference remains due to Laplace smoothing on smaller groups.
383-
assert agree_score_2_groups == pytest.approx(agree_score_3_groups, abs=0.06)
400+
# With geometric mean, both should be reasonably close (same underlying consensus).
401+
# Without it (raw product), 3 groups would give much lower scores.
402+
# The effective agreement formula (p_agree * (1 - p_disagree)) amplifies
403+
# the Laplace smoothing gap between group sizes, so the tolerance is wider
404+
# than with raw p_agree alone.
405+
assert agree_score_2_groups == pytest.approx(agree_score_3_groups, abs=0.1)
384406

385-
# Both should be well above 0.5 (all participants agree)
407+
# Both should be above 0.5 (all participants agree)
386408
assert agree_score_2_groups > 0.5
387409
assert agree_score_3_groups > 0.5
388410

389411

412+
def test_group_aware_consensus_penalizes_divided_groups():
413+
"""
414+
Verify that a group split roughly evenly between agree and disagree
415+
drags down the consensus score compared to unanimous agreement.
416+
417+
This is the core behavior of the effective agreement divergence from Polis.
418+
"""
419+
# 2 groups of 5 participants, 1 statement.
420+
# Group 0: all agree. Group 1: split 3 agree / 2 disagree.
421+
vote_matrix_divided = pd.DataFrame(
422+
{0: [1, 1, 1, 1, 1, 1, 1, 1, -1, -1]},
423+
index=list(range(10)),
424+
)
425+
cluster_labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
426+
427+
# Unanimous: both groups fully agree
428+
vote_matrix_unanimous = pd.DataFrame(
429+
{0: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]},
430+
index=list(range(10)),
431+
)
432+
433+
*_, C_divided = stats.calculate_comment_statistics(
434+
vote_matrix=vote_matrix_divided,
435+
cluster_labels=cluster_labels,
436+
)
437+
*_, C_unanimous = stats.calculate_comment_statistics(
438+
vote_matrix=vote_matrix_unanimous,
439+
cluster_labels=cluster_labels,
440+
)
441+
442+
divided_score = C_divided[stats.votes.A, 0]
443+
unanimous_score = C_unanimous[stats.votes.A, 0]
444+
445+
# Divided group should significantly lower the consensus score
446+
assert divided_score < unanimous_score
447+
# A group split 3/2 should produce a score below 0.5 (not genuine consensus)
448+
assert divided_score < 0.5
449+
# Unanimous agreement should be well above 0.5
450+
assert unanimous_score > 0.5
451+
452+
390453
def test_format_comment_stats_repful_agree():
391454
statement = pd.Series({
392455
"statement_id": 1,

0 commit comments

Comments
 (0)