@@ -311,6 +311,12 @@ def test_priority_metric_array():
311311
312312@pytest .mark .parametrize ("polis_convo_data" , ["small-no-meta" , "small-with-meta" , "medium-no-meta" , "medium-with-meta" ], indirect = True )
313313def test_group_aware_consensus_real_data (polis_convo_data ):
314+ """
315+ Verify group-aware consensus on real Polis data.
316+
317+ Tests both the Polis-compatible baseline (geometric mean of raw p_agree)
318+ and our effective agreement divergence (p_agree * (1 - p_disagree)).
319+ """
314320 fixture = polis_convo_data
315321 loader = Loader (filepaths = [
316322 f'{ fixture .data_dir } /votes.json' ,
@@ -333,20 +339,34 @@ def test_group_aware_consensus_real_data(polis_convo_data):
333339 all_clustered_participant_ids , cluster_labels = polismath .extract_data_from_polismath (fixture .math_data )
334340 vote_matrix = vote_matrix .loc [all_clustered_participant_ids , :]
335341
336- # Generate stats all groups and all statements.
337- _ , gac_df = stats .calculate_comment_statistics_dataframes (
342+ # Generate stats for all groups and all statements.
343+ # This returns both per-group probabilities (P_v_g_c) and
344+ # the final group-aware consensus (C_v_c) which uses effective agreement.
345+ N_g_c , N_v_g_c , P_v_g_c , _ , P_v_g_c_test , _ , C_v_c = stats .calculate_comment_statistics (
338346 vote_matrix = vote_matrix ,
339347 cluster_labels = cluster_labels ,
340348 )
341349
342- calculated_gac = {
343- str (pid ): float (row .iloc [0 ])
344- for pid , row in gac_df .iterrows ()
345- }
346-
347350 n_groups = len (set (cluster_labels ))
348- expected_gac = helpers .polis_gac_to_geometric_mean (n_groups , fixture .math_data ["group-aware-consensus" ])
349- assert calculated_gac == pytest .approx (expected_gac )
351+
352+ # 1) Verify Polis baseline: geometric mean of raw p_agree still matches fixtures
353+ polis_baseline_agree = P_v_g_c [stats .votes .A , :, :].prod (axis = 0 ) ** (1.0 / n_groups )
354+ calculated_baseline = {
355+ str (sid ): float (polis_baseline_agree [i ])
356+ for i , sid in enumerate (vote_matrix .columns )
357+ }
358+ expected_polis_gac = helpers .polis_gac_to_geometric_mean (n_groups , fixture .math_data ["group-aware-consensus" ])
359+ assert calculated_baseline == pytest .approx (expected_polis_gac )
360+
361+ # 2) Verify effective agreement scores are in [0, 1] and <= Polis baseline
362+ for i , sid in enumerate (vote_matrix .columns ):
363+ effective_score = float (C_v_c [stats .votes .A , i ])
364+ baseline_score = float (polis_baseline_agree [i ])
365+ assert 0 <= effective_score <= 1 , f"Statement { sid } : score { effective_score } out of [0, 1]"
366+ assert effective_score <= baseline_score + 1e-9 , (
367+ f"Statement { sid } : effective agreement { effective_score } should be "
368+ f"<= Polis baseline { baseline_score } "
369+ )
350370
351371
352372def test_group_aware_consensus_uses_geometric_mean ():
@@ -377,16 +397,59 @@ def test_group_aware_consensus_uses_geometric_mean():
377397 agree_score_2_groups = C_2 [0 , 0 ] # votes.A = 0
378398 agree_score_3_groups = C_3 [0 , 0 ]
379399
380- # With geometric mean, both should be close (same underlying consensus).
381- # Without it (raw product), 3 groups would give 0.512 vs 0.640 — much wider gap.
382- # Small difference remains due to Laplace smoothing on smaller groups.
383- assert agree_score_2_groups == pytest .approx (agree_score_3_groups , abs = 0.06 )
400+ # With geometric mean, both should be reasonably close (same underlying consensus).
401+ # Without it (raw product), 3 groups would give much lower scores.
402+ # The effective agreement formula (p_agree * (1 - p_disagree)) amplifies
403+ # the Laplace smoothing gap between group sizes, so the tolerance is wider
404+ # than with raw p_agree alone.
405+ assert agree_score_2_groups == pytest .approx (agree_score_3_groups , abs = 0.1 )
384406
385- # Both should be well above 0.5 (all participants agree)
407+ # Both should be above 0.5 (all participants agree)
386408 assert agree_score_2_groups > 0.5
387409 assert agree_score_3_groups > 0.5
388410
389411
def test_group_aware_consensus_penalizes_divided_groups():
    """
    Check that division inside one group lowers the group-aware consensus.

    Two single-statement conversations are scored, each with two groups of
    five participants. In the first, group 0 agrees unanimously while group 1
    splits 3 agree / 2 disagree; in the second, every participant agrees.
    The agree consensus of the divided conversation must be lower than the
    unanimous one, and the two must land on opposite sides of 0.5.

    This is the core behavior of the effective agreement divergence from
    Polis.
    """
    participant_ids = list(range(10))
    # First five participants belong to group 0, the last five to group 1.
    cluster_labels = [0] * 5 + [1] * 5

    # Column 0 is the only statement. In the divided scenario the last two
    # members of group 1 disagree, giving that group a 3/2 split.
    scenarios = {
        "divided": pd.DataFrame({0: [1] * 8 + [-1] * 2}, index=participant_ids),
        "unanimous": pd.DataFrame({0: [1] * 10}, index=participant_ids),
    }

    agree_scores = {}
    for label, votes in scenarios.items():
        # Only the last returned value (the consensus array) is used here.
        *_unused, consensus = stats.calculate_comment_statistics(
            vote_matrix=votes,
            cluster_labels=cluster_labels,
        )
        agree_scores[label] = consensus[stats.votes.A, 0]

    divided_score = agree_scores["divided"]
    unanimous_score = agree_scores["unanimous"]

    # The divided conversation must score lower than the unanimous one.
    assert divided_score < unanimous_score
    # A 3/2 split inside one group should not count as genuine consensus.
    assert divided_score < 0.5
    # Full agreement everywhere should stay above the 0.5 mark.
    assert unanimous_score > 0.5
451+
452+
390453def test_format_comment_stats_repful_agree ():
391454 statement = pd .Series ({
392455 "statement_id" : 1 ,
0 commit comments