Remove unused functions from the old ported Polis PCA code, and the unused `impute_missing_votes()` function.

patcon · patcon · commit 524ab6974a3d · 2025-04-24T01:13:18.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -41,6 +41,8 @@
 - Add support for `Loader()` importing data from alternative Polis instances via `polis_instance_url` arg.
 - Patch sklearn with a simple `PatchedPipeline`, to allow pipeline steps to access other steps.
 - Modify `SparsityAwareScaler` to be able to use captured output from SparsityAware Capture.
+- Remove ported Polis PCA functions that are no longer used.
+- Remove old `impute_missing_votes()` function that's no longer used.
 
 ### Chores
 - Moved agora implementation from `reddwarf.agora` to `reddwarf.implementations.agora` (deprecation warning).
diff --git a/docs/api_reference.md b/docs/api_reference.md
@@ -77,10 +77,6 @@ use in Scikit-Learn workflows, pipelines, and APIs.
     options:
         show_root_heading: true
 
-### ::: reddwarf.utils.impute_missing_votes
-    options:
-        show_root_heading: true
-
 ### ::: reddwarf.utils.get_unvoted_statement_ids
     options:
         show_root_heading: true
diff --git a/reddwarf/utils/matrix.py b/reddwarf/utils/matrix.py
@@ -7,34 +7,6 @@
 
 VoteMatrix: TypeAlias = pd.DataFrame
 
-def impute_missing_votes(vote_matrix: VoteMatrix) -> VoteMatrix:
-    """
-    Imputes missing votes in a voting matrix using column-wise mean. All columns must have at least one vote.
-
-    Reference:
-        Small, C. (2021). "Polis: Scaling Deliberation by Mapping High Dimensional Opinion Spaces."
-        Specific highlight: <https://hyp.is/8zUyWM5fEe-uIO-J34vbkg/gwern.net/doc/sociology/2021-small.pdf>
-
-    Args:
-        vote_matrix (pd.DataFrame):  A vote matrix DataFrame with `NaN`/`None` values where: \
-                                        1. rows are voters, \
-                                        2. columns are statements, and \
-                                        3. values are votes.
-
-    Returns:
-        imputed_matrix (pd.DataFrame): The same vote matrix DataFrame imputing missing values with column mean.
-    """
-    if vote_matrix.isna().all(axis="rows").any():
-        raise RedDwarfError("impute_missing_votes does not support vote matrices containing statement columns with no votes.")
-
-    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
-    imputed_matrix = pd.DataFrame(
-        mean_imputer.fit_transform(vote_matrix),
-        columns=vote_matrix.columns,
-        index=vote_matrix.index,
-    )
-    return imputed_matrix
-
 def filter_votes(
         votes: List[Dict],
         cutoff: Optional[int] = None,
diff --git a/reddwarf/utils/pca.py b/reddwarf/utils/pca.py
@@ -2,7 +2,7 @@
 import pandas as pd
 import numpy as np
 from reddwarf.utils.matrix import VoteMatrix, generate_virtual_vote_matrix
-from reddwarf.sklearn.transformers import SparsityAwareCapturer, SparsityAwareScaler, calculate_scaling_factors
+from reddwarf.sklearn.transformers import SparsityAwareCapturer, SparsityAwareScaler
 from reddwarf.sklearn.pipeline import PatchedPipeline
 from typing import Tuple
 
@@ -67,99 +67,7 @@ def run_pca(
 
     return projected_participants, projected_statements, pca
 
-# TODO: Clean up variables and docs.
-def sparsity_aware_project_ptpt(participant_votes, statement_components, statement_means):
-    """
-    Projects a sparse vote vector into PCA space while adjusting for sparsity.
-
-    Args:
-        participant_votes (list): List of participant votes on each statement
-        statement_components (list[list[float]]): Two lists of floats corresponding to the two principal components
-        statement_means (list[float]): List of floats corresponding to the centers/means of each statement
-
-    Returns:
-        projected_coords (list[list[float]]): Two lists corresponding to projected xy coordinates.
-    """
-    statement_components = np.array(statement_components)  # Shape: (2, n_features)
-    statement_means = np.array(statement_means)  # Shape: (n_features,)
-
-    participant_votes = np.array(participant_votes)
-    mask = ~np.isnan(participant_votes)  # Only consider non-null values
-
-    # Extract relevant values
-    x_vals = participant_votes[mask] - statement_means[mask]  # Centered values
-    # TODO: Extend this to work in 3D
-    pc1_vals, pc2_vals = statement_components[:, mask]  # Select only used components
-
-    # Compute dot product projection
-    p1 = np.dot(x_vals, pc1_vals)
-    p2 = np.dot(x_vals, pc2_vals)
-
-    scaling_factor = calculate_scaling_factors(participant_votes)
-
-    coord_projected = np.array([p1, p2])
-    coord_scaled = coord_projected * scaling_factor
-
-    return coord_scaled
-
-# TODO: Clean up variables and docs.
-def sparsity_aware_project_ptpts(vote_matrix, statement_components, statement_means):
-    """
-    Apply sparsity-aware projection to multiple vote vectors.
-    """
-    return np.array([
-        sparsity_aware_project_ptpt(participant_votes, statement_components, statement_means)
-        for participant_votes in vote_matrix]
-    )
-
-# TODO: Clean up variables and docs.
-def pca_project_cmnts(statement_components, statement_means):
-    """
-    Projects unit vectors for each feature into PCA space to understand their placement.
-    """
-
-    # Create a matrix of virtual participants that each vote once on a single statement.
-
-    # Build an basic identity matrix
-    n_statements = len(statement_means)
-    virtual_vote_matrix = np.eye(n_statements)
-
-    # Replace 1s with -1 and 0s with NaN
-    # TODO: Why does Polis use -1 (disagree) here? is it the same? BUG?
-    AGREE_VAL = 1
-    MISSING_VAL = np.nan
-    virtual_vote_matrix = np.where(virtual_vote_matrix == 1, AGREE_VAL, MISSING_VAL)
-
-    statement_projections = sparsity_aware_project_ptpts(
-        virtual_vote_matrix,
-        statement_components,
-        statement_means,
-    )
-
-    return statement_projections
-
 def calculate_extremity(projections: ArrayLike):
     # Compute extremity as vector magnitude on rows.
     # vector magnitude = Euclidean norm = hypotenuse of xy
-    return np.linalg.norm(projections, axis=0)
-
-# TODO: Clean up variables and docs.
-def with_proj_and_extremity(pca):
-    """
-    Compute projection and extremity, then merge into PCA results.
-    """
-    statement_projections = pca_project_cmnts(
-        statement_components=pca["comps"],
-        statement_means=pca["center"],
-    )
-
-    # Flip the axes to get all x together and y together.
-    # 2 sets of 40. shape (2, 40)
-    statement_projections = statement_projections.transpose()
-
-    statement_extremities = calculate_extremity(statement_projections)
-
-    pca["comment-projection"] = statement_projections.tolist()
-    pca["comment-extremity"] = statement_extremities.tolist()
-
-    return pca
+    return np.linalg.norm(projections, axis=0)
diff --git a/tests/implementations/test_polis_legacy.py b/tests/implementations/test_polis_legacy.py
@@ -82,21 +82,6 @@ def test_statement_count(polis_convo_data):
 
     assert client.statement_count == expected_data
 
-@pytest.mark.parametrize("polis_convo_data", ["small-no-meta", "small-with-meta", "medium-with-meta", "medium-no-meta"], indirect=True)
-def test_impute_missing_values(polis_convo_data):
-    fixture = polis_convo_data
-    client = PolisClient(is_strict_moderation=False)
-    client.load_data(filepaths=[
-        f'{fixture.data_dir}/votes.json',
-        f'{fixture.data_dir}/comments.json',
-    ])
-    matrix_with_missing = client.get_matrix(is_filtered=True)
-    matrix_without_missing = utils.impute_missing_votes(matrix_with_missing)
-
-    assert matrix_with_missing.isnull().values.sum() > 0
-    assert matrix_without_missing.isnull().values.sum() == 0
-    assert matrix_with_missing.shape == matrix_without_missing.shape
-
 # This test can't be paramtrized without changes.
 @pytest.mark.parametrize("polis_convo_data", ["small"], indirect=True)
 def test_filtered_participants_grouped(polis_convo_data):
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -267,47 +267,4 @@ def test_filter_matrix_mod_out_statement_with_zero(make_vote_matrix):
     )
     assert statement_count(initial_matrix) == statement_count(filtered_matrix)
 
-    assert (filtered_matrix[1] == 0).all()
-
-def test_impute_missing_votes():
-    # Manually calculated
-    expected_matrix = pd.DataFrame(
-        [
-            [ 0,    1,    0,  1/3],
-            [-1,    0,    0,   -1],
-            [-1,  0.5,    0,    1],
-            [-1,  0.5,    0,    1],
-        ],
-        columns=[0, 1, 2, 3], # statement_ids
-        index=[0, 1, 2, 3], # participant_ids
-        dtype=float,
-    )
-
-    initial_matrix = pd.DataFrame(
-        [
-            [ 0,    1, None, None],
-            [-1,    0, None,   -1],
-            [-1, None, None,    1],
-            [-1, None,    0,    1],
-        ],
-        columns=[0, 1, 2, 3], # statement_ids
-        index=[0, 1, 2, 3], # participant_ids
-        dtype=float,
-    )
-    imputed_matrix = utils.impute_missing_votes(vote_matrix=initial_matrix)
-    assert_frame_equal(imputed_matrix, expected_matrix)
-
-def test_impute_missing_votes_no_vote_statement_error():
-    initial_matrix = pd.DataFrame(
-        [
-            [ 0,    1, None, None],
-            [-1,    0, None,   -1],
-            [-1, None, None,    1],
-            [-1, None, None,    1],
-        ],
-        columns=[0, 1, 2, 3], # statement_ids
-        index=[0, 1, 2, 3], # participant_ids
-        dtype=float,
-    )
-    with pytest.raises(RedDwarfError):
-        utils.impute_missing_votes(vote_matrix=initial_matrix)
+    assert (filtered_matrix[1] == 0).all()
diff --git a/tests/utils/test_pca.py b/tests/utils/test_pca.py
@@ -144,10 +144,6 @@ def test_run_pca_real_data_below_100(polis_convo_data):
         pid = expected["participant_id"]
         assert_array_almost_equal(actual_projected_participants.loc[pid, ["x","y"]].values, expected["xy"])
 
-    # print(actual_projected_statements.values.transpose())
-    # print(expected_pca["comment-projection"])
-    # print(PcaUtils.pca_project_cmnts(actual_pca.components_, actual_pca.mean_).transpose())
-    # print(PcaUtils.pca_project_cmnts(expected_pca["comps"], np.asarray(expected_pca["center"])).transpose())
     assert_array_almost_equal(actual_projected_statements.values.transpose(), expected_pca["comment-projection"])
 
 # TODO: Accomodate sign flips in "medium-no-meta".
@@ -212,21 +208,4 @@ def test_run_pca_real_data_testing():
     assert actual_pca.components_[0] == pytest.approx(expected_pca["comps"][0])
     assert actual_pca.components_[1] == pytest.approx(expected_pca["comps"][1])
 
-    assert actual_pca.mean_ == pytest.approx(expected_pca["center"])
-
-@pytest.mark.parametrize("polis_convo_data", ["small-no-meta", "small-with-meta", "medium-with-meta"], indirect=True)
-def test_with_proj_and_extremity(polis_convo_data):
-    fixture = polis_convo_data
-    # Invert to correct for flipped signs in polismath.
-    math_data = helpers.flip_signs_by_key(nested_dict=fixture.math_data, keys=["pca.center", "pca.comment-projection", "base-clusters.x", "base-clusters.y", "group-clusters[*].center"])
-    expected_pca = math_data["pca"]
-
-    minimal_pca = {
-        "center": expected_pca["center"],
-        "comps": expected_pca["comps"],
-    }
-
-    calculated_pca = PcaUtils.with_proj_and_extremity(pca=minimal_pca)
-
-    assert expected_pca["comment-projection"] == calculated_pca["comment-projection"]
-    assert expected_pca["comment-extremity"] == calculated_pca["comment-extremity"]
+    assert actual_pca.mean_ == pytest.approx(expected_pca["center"])