Cleaned up sklearn pipeline in run_pca with some modifications.

patcon · patcon · commit ee6ca7a7f7b5 · 2025-04-24T00:44:27.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -39,6 +39,8 @@
 - Add group statement stats to MultiIndex DataFrame.
 - Add `reddwarf.data_presenter.print_repress()` for printing representative statements.
 - Add support for `Loader()` importing data from alternative Polis instances via `polis_instance_url` arg.
+- Patch sklearn with a simple `PatchedPipeline`, to allow pipeline steps to access other steps.
+- Modify `SparsityAwareScaler` to be able to use captured output from `SparsityAwareCapturer`.
 
 ### Chores
 - Moved agora implementation from `reddwarf.agora` to `reddwarf.implementations.agora` (deprecation warning).
diff --git a/reddwarf/sklearn/pipeline.py b/reddwarf/sklearn/pipeline.py
@@ -0,0 +1,35 @@
+from sklearn.pipeline import Pipeline
+
+
+class PatchedPipeline(Pipeline):
+    """
+    A subclass of sklearn's Pipeline that injects a `_parent_pipeline` attribute into each step.
+
+    This allows individual transformers in the pipeline to access their parent pipeline and,
+    by extension, other steps within it. Useful for custom transformers that depend on
+    intermediate results from earlier steps (e.g., SparsityAwareScaler using SparsityAwareCapturer output).
+
+    Example:
+    ```
+    pipeline = PatchedPipeline([
+        ("capture", SparsityAwareCapturer()),
+        ("scale", SparsityAwareScaler(capture_step="capture")),
+    ])
+
+    # Inside SparsityAwareScaler.transform():
+    # capture_step = self._parent_pipeline.named_steps["capture"]
+    # X_sparse = capture_step.X_captured_
+    ```
+
+    Note:
+        - Steps must support attribute assignment (`__dict__`) to receive the reference.
+        - `_parent_pipeline` is injected once during initialization.
+    """
+    def __init__(self, steps, **kwargs):
+        super().__init__(steps, **kwargs)
+        self._patch_steps()
+
+    def _patch_steps(self):
+        for _, step in self.steps:
+            if hasattr(step, '__dict__'):
+                step._parent_pipeline = self
diff --git a/reddwarf/sklearn/transformers.py b/reddwarf/sklearn/transformers.py
@@ -7,6 +7,11 @@
 Array2D = NDArray[np.float64]
 
 def calculate_scaling_factors(X_sparse: Array1D | Array2D) -> Array1D:
+    """
+    Calculate row-based scaling factors from the sparse vote matrix.
+
+    (Outside estimator so available for re-use.)
+    """
     # This allows function to work for 2D (full vote_matrix) and 1D (participant_votes).
     # It essentially nest an 1D matrix in a 2D one.
     X_sparse = np.atleast_2d(X_sparse)
@@ -29,14 +34,16 @@ def calculate_scaling_factors(X_sparse: Array1D | Array2D) -> Array1D:
 
 class SparsityAwareScaler(BaseEstimator, TransformerMixin):
     """
-    Scale projected points (participant/statements) based on sparsity of vote
+    Scale projected points (participants or statements) based on sparsity of vote
     matrix, to account for any small number of votes by a participant and
     prevent those participants from bunching up in the center.
 
     Attributes:
+        capture_step (str | int | None): Name or index of the capture step in the pipeline.
         X_sparse (np.ndarray | None): A sparse array with shape (n_features,)
     """
-    def __init__(self, X_sparse: Optional[Array1D | Array2D] = None):
+    def __init__(self, capture_step: Optional[str | int] = None, X_sparse: Optional[Array1D | Array2D] = None):
+        self.capture_step = capture_step
         self.X_sparse = X_sparse
 
     # See: https://scikit-learn.org/stable/modules/generated/sklearn.utils.Tags.html#sklearn.utils.Tags
@@ -57,10 +64,53 @@ def inverse_transform(self, X):
         scaling_factors = self._calculate_scaling_factors()
         return X / scaling_factors[:, np.newaxis]
 
-    def _calculate_scaling_factors(self):
-        if self.X_sparse is None:
+
+    def _get_pipeline_step(self, step):
+        """
+        Fetch a step (by name or index) from the parent pipeline, which is only available via PatchedPipeline usage.
+        """
+        parent = getattr(self, "_parent_pipeline", None)
+        if parent is None:
+            raise RuntimeError(
+                f"{self.__class__.__name__} cannot resolve `capture_step={step}` "
+                "because it is not being used inside a `PatchedPipeline`. "
+                "Either use a `PatchedPipeline` or pass `X_sparse` directly."
+            )
+        if isinstance(step, str):
+            return parent.named_steps[step]
+        elif isinstance(step, int):
+            return parent.steps[step][1]
+        else:
+            raise ValueError("`capture_step` must be a string (name) or int (index).")
+
+    def _resolve_X_sparse(self):
+        """
+        Resolve X_sparse (a sparse vote matrix) from argument or prior capture step.
+        """
+        if self.X_sparse is not None:
+            return self.X_sparse
+
+        capture = self._get_pipeline_step(self.capture_step)
+        if not hasattr(capture, "X_captured_"):
             raise AttributeError(
-                "Missing `X_sparse`. Pass `X_sparse` when initializing SparsityAwareScaler."
+                f"Step '{self.capture_step}' does not contain `.X_captured_`. "
+                f"Did you run `fit/transform` on the pipeline?"
             )
+        return capture.X_captured_
 
-        return calculate_scaling_factors(X_sparse=self.X_sparse)
+    def _calculate_scaling_factors(self):
+        X_sparse = self._resolve_X_sparse()
+        return calculate_scaling_factors(X_sparse=X_sparse)
+
+class SparsityAwareCapturer(BaseEstimator, TransformerMixin):
+    """
+    A passthrough transformer that captures and stores the X it receives in
+    `self.X_captured_`. Useful in pipelines where a later step needs access to
+    this intermediate result.
+    """
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        self.X_captured_ = X  # Store the actual input value
+        return X
diff --git a/reddwarf/utils/matrix.py b/reddwarf/utils/matrix.py
@@ -232,4 +232,21 @@ def filter_matrix(
     elif unvoted_filter_type == 'zero':
         vote_matrix[unvoted_statement_ids] = 0
 
-    return vote_matrix
+    return vote_matrix
+
+def generate_virtual_vote_matrix(n_statements: int):
+    """
+    Creates a matrix of virtual participants, each of whom votes "agree" on a
+    single statement, with no other votes. (This is a variation of an "identity
+    matrix", with votes going across the diagonal of a full NaN matrix.)
+    """
+    # Build a basic identity matrix
+    virtual_vote_matrix = np.eye(n_statements)
+
+    # Replace 1s with +1 and 0s with NaN
+    # TODO: Why does Polis use -1 (disagree) here? is it the same? BUG?
+    AGREE_VAL = 1
+    MISSING_VAL = np.nan
+    virtual_vote_matrix = np.where(virtual_vote_matrix == 1, AGREE_VAL, MISSING_VAL)
+
+    return virtual_vote_matrix
diff --git a/reddwarf/utils/pca.py b/reddwarf/utils/pca.py
@@ -1,13 +1,14 @@
 from numpy.typing import ArrayLike
 import pandas as pd
 import numpy as np
-from reddwarf.utils.matrix import VoteMatrix
-from reddwarf.sklearn.transformers import SparsityAwareScaler, calculate_scaling_factors
+from reddwarf.utils.matrix import VoteMatrix, generate_virtual_vote_matrix
+from reddwarf.sklearn.transformers import SparsityAwareCapturer, SparsityAwareScaler, calculate_scaling_factors
+from reddwarf.sklearn.pipeline import PatchedPipeline
 from typing import Tuple
 
 from sklearn.decomposition import PCA
 from sklearn.impute import SimpleImputer
-from sklearn.pipeline import Pipeline
+
 
 def run_pca(
         vote_matrix: VoteMatrix,
@@ -30,39 +31,22 @@ def run_pca(
             - explained_variance_ (List[float]): Explained variance of each principal component.
             - mean_ (list[float]): Means/centers of each column/statements/features.
     """
-    pipeline = Pipeline([
+    pipeline = PatchedPipeline([
+        ("capture", SparsityAwareCapturer()),
         ("impute", SimpleImputer(missing_values=np.nan, strategy="mean")),
         ("pca", PCA(n_components=n_components)),
-        ("scale", SparsityAwareScaler()),
+        ("scale", SparsityAwareScaler(capture_step="capture")),
     ])
 
     pipeline.fit(vote_matrix.values)
-    pca = pipeline.named_steps["pca"]
-
-    def generate_projections(sparse_vote_matrix, fitted_pipeline):
-        fitted_pipeline.named_steps["scale"].X_sparse = sparse_vote_matrix
-        X_projected = fitted_pipeline.transform(sparse_vote_matrix)
-
-        return X_projected
-
-    # Create a matrix of virtual participants that each vote once on a single statement.
-    def generate_virtual_vote_matrix(n_statements: int):
-        # Build an basic identity matrix
-        virtual_vote_matrix = np.eye(n_statements)
-
-        # Replace 1s with +1 and 0s with NaN
-        # TODO: Why does Polis use -1 (disagree) here? is it the same? BUG?
-        AGREE_VAL = 1
-        MISSING_VAL = np.nan
-        virtual_vote_matrix = np.where(virtual_vote_matrix == 1, AGREE_VAL, MISSING_VAL)
-
-        return virtual_vote_matrix
 
-    X_participants = generate_projections(sparse_vote_matrix=vote_matrix.values, fitted_pipeline=pipeline)
+    # Generate projections of participants.
+    X_participants = pipeline.transform(vote_matrix.values)
 
+    # Generate projections of statements via virtual vote matrix.
     n_statements = len(vote_matrix.columns)
     virtual_vote_matrix = generate_virtual_vote_matrix(n_statements)
-    X_statements = generate_projections(sparse_vote_matrix=virtual_vote_matrix, fitted_pipeline=pipeline)
+    X_statements = pipeline.transform(virtual_vote_matrix)
 
     DEFAULT_DIMENSION_LABELS = ["x", "y", "z"]
     dimension_labels = DEFAULT_DIMENSION_LABELS[:n_components]
@@ -79,6 +63,8 @@ def generate_virtual_vote_matrix(n_statements: int):
         columns=np.asarray(dimension_labels),
     )
 
+    pca = pipeline.named_steps["pca"]
+
     return projected_participants, projected_statements, pca
 
 # TODO: Clean up variables and docs.