WIP: testing the idea of a CaptureMixin and CaptureWrapper for estimators.

patcon · patcon · commit 10744cb31c95 · 2025-04-24T13:37:35.000-04:00
diff --git a/reddwarf/implementations/polis.py b/reddwarf/implementations/polis.py
@@ -71,12 +71,13 @@ def run_clustering(
     """
     raw_vote_matrix = generate_raw_matrix(votes=votes)
 
+    # No longer fed into run_pca; kept only for the output below, until the filtered matrix can be inspected from inside the pipeline.
     filtered_vote_matrix = simple_filter_matrix(
         vote_matrix=raw_vote_matrix,
         mod_out_statement_ids=mod_out_statement_ids,
     )
 
-    projected_participants, projected_statements, pca = run_pca(vote_matrix=filtered_vote_matrix)
+    projected_participants, projected_statements, pca = run_pca(vote_matrix=raw_vote_matrix, mod_out_statement_ids=mod_out_statement_ids)
 
     participant_ids_clusterable = get_clusterable_participant_ids(raw_vote_matrix, vote_threshold=min_user_vote_threshold)
     if keep_participant_ids:
diff --git a/reddwarf/sklearn/transformers.py b/reddwarf/sklearn/transformers.py
@@ -102,6 +102,8 @@ def _calculate_scaling_factors(self):
         X_sparse = self._resolve_X_sparse()
         return calculate_scaling_factors(X_sparse=X_sparse)
 
+# TODO: Replace with CaptureMixin and CaptureWrapper.
+# See: https://chatgpt.com/c/680a512a-f604-800b-8922-1992a8ddf491
 class SparsityAwareCapturer(BaseEstimator, TransformerMixin):
     """
     A passthrough transformer that captures and stores the X it receives in
diff --git a/reddwarf/utils/matrix.py b/reddwarf/utils/matrix.py
@@ -112,9 +112,9 @@ def get_unvoted_statement_ids(vote_matrix: VoteMatrix) -> List[int]:
     return null_column_ids
 
 def simple_filter_matrix(
-        vote_matrix: VoteMatrix,
+        vote_matrix: VoteMatrix | np.ndarray,
         mod_out_statement_ids: list[int] = [],
-) -> VoteMatrix:
+) -> VoteMatrix | np.ndarray:
     """
     The simple filter on the vote_matrix that is used by Polis prior to running PCA.
 
@@ -125,14 +125,22 @@ def simple_filter_matrix(
     Returns:
         VoteMatrix: Copy of vote_matrix with statements zero'd out
     """
-    vote_matrix = vote_matrix.copy()
-    for tid in mod_out_statement_ids:
-        # Zero out column only if already exists (ie. has votes)
-        if tid in vote_matrix.columns:
-            # TODO: Add a flag to try np.nan instead of zero.
-            vote_matrix.loc[:, tid] = 0
-
-    return vote_matrix
+    if isinstance(vote_matrix, pd.DataFrame):
+        vote_matrix = vote_matrix.copy()
+        for col in mod_out_statement_ids:
+            if col in vote_matrix.columns:
+                vote_matrix[col] = 0
+        return vote_matrix
+
+    elif isinstance(vote_matrix, np.ndarray):
+        vote_matrix = vote_matrix.copy()
+        for col in mod_out_statement_ids:
+            if isinstance(col, int) and 0 <= col < vote_matrix.shape[1]:
+                vote_matrix[:, col] = 0
+        return vote_matrix
+
+    else:
+        raise TypeError("vote_matrix must be a pandas DataFrame or a NumPy ndarray.")
 
 def get_clusterable_participant_ids(vote_matrix: VoteMatrix, vote_threshold: int) -> list:
     """
diff --git a/reddwarf/utils/pca.py b/reddwarf/utils/pca.py
@@ -1,18 +1,36 @@
 from numpy.typing import ArrayLike
 import pandas as pd
 import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.pipeline import FunctionTransformer
 from reddwarf.utils.matrix import VoteMatrix, generate_virtual_vote_matrix
 from reddwarf.sklearn.transformers import SparsityAwareCapturer, SparsityAwareScaler
 from reddwarf.sklearn.pipeline import PatchedPipeline
-from typing import Tuple
+from typing import Optional, Tuple
 
 from sklearn.decomposition import PCA
 from sklearn.impute import SimpleImputer
 
+from reddwarf.utils.matrix import simple_filter_matrix
+
+class ModerationFilterTransformer(BaseEstimator, TransformerMixin):
+    """
+    Stateless transformer that zeroes out moderated columns via simple_filter_matrix (None or empty = filter nothing).
+    """
+    def __init__(self, columns_to_filter: Optional[list[int]] = None):
+        self.columns_to_filter = columns_to_filter  # Stored as given; a None default avoids one list shared by all instances.
+
+    def fit(self, X, y=None):
+        return self  # Nothing is learned from X.
+
+    def transform(self, X):
+        return simple_filter_matrix(X, self.columns_to_filter or [])  # Resolve None lazily so the stored param stays untouched.
+
 
 def run_pca(
         vote_matrix: VoteMatrix,
         n_components: int = 2,
+        mod_out_statement_ids: list[int] = [],
 ) -> Tuple[ pd.DataFrame, pd.DataFrame, PCA ]:
     """
     Process a prepared vote matrix to be imputed and return projected participant data,
@@ -31,23 +49,35 @@ def run_pca(
             - explained_variance_ (List[float]): Explained variance of each principal component.
             - mean_ (list[float]): Means/centers of each column/statements/features.
     """
+    X_raw = vote_matrix.values
+    # moderation = ModerationFilterTransformer(columns_to_filter=mod_out_statement_ids)
+    # X_moderated = simple_filter_matrix(X_raw)
+
+    # filtered_vote_matrix = simple_filter_matrix(
+    #     vote_matrix=vote_matrix,
+    #     mod_out_statement_ids=mod_out_statement_ids,
+    # )
+
     pipeline = PatchedPipeline([
+        ("moderate", ModerationFilterTransformer()),
         ("capture", SparsityAwareCapturer()),
         ("impute", SimpleImputer(missing_values=np.nan, strategy="mean")),
         ("pca", PCA(n_components=n_components)),
         ("scale", SparsityAwareScaler(capture_step="capture")),
     ])
 
-    pipeline.fit(vote_matrix.values)
 
     # Generate projections of participants.
-    X_participants = pipeline.transform(vote_matrix.values)
+    pipeline.named_steps["moderate"].columns_to_filter = mod_out_statement_ids
+    pipeline.fit(X_raw)
+    X_participants = pipeline.transform(X_raw)
 
     # Generate projections of statements via virtual vote matrix.
     # This projects unit vectors for each feature/statement into PCA space to
     # understand their placement.
     n_statements = len(vote_matrix.columns)
     virtual_vote_matrix = generate_virtual_vote_matrix(n_statements)
+    pipeline.named_steps["moderate"].columns_to_filter = []
     X_statements = pipeline.transform(virtual_vote_matrix)
 
     DEFAULT_DIMENSION_LABELS = ["x", "y", "z"]