polis-community
diff --git a/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions b/‎CHANGELOG.md‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎docs/api_reference.md‎
Lines changed: 0 additions & 4 deletions b/‎docs/api_reference.md‎
Lines changed: 0 additions & 4 deletions
diff --git a/‎docs/notebooks/polis-implementation-demo.ipynb‎
Lines changed: 1 addition & 1 deletion b/‎docs/notebooks/polis-implementation-demo.ipynb‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎reddwarf/implementations/polis.py‎
Lines changed: 46 additions & 13 deletions b/‎reddwarf/implementations/polis.py‎
Lines changed: 46 additions & 13 deletions
diff --git a/‎reddwarf/sklearn/pipeline.py‎
Lines changed: 35 additions & 0 deletions b/‎reddwarf/sklearn/pipeline.py‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎reddwarf/sklearn/transformers.py‎
Lines changed: 56 additions & 6 deletions b/‎reddwarf/sklearn/transformers.py‎
Lines changed: 56 additions & 6 deletions
diff --git a/‎reddwarf/utils/matrix.py‎
Lines changed: 18 additions & 29 deletions b/‎reddwarf/utils/matrix.py‎
Lines changed: 18 additions & 29 deletions
@@ -6,6 +6,7 @@
 - Allow `is_strict_moderation` to be inferred from not just API data, but file data.
 - Better handle numpy divide-by-zero edge-cases in two-property test. ([#28](https://github.com/polis-community/red-dwarf/pull/28))
 - Fix bug where `vote_matrix` was modified directly, leading to subtle side-effects.
+- Fix bug in `select_representative_statements()` where mod-out statements weren't ignored.
 
 ### Changes
 - Fixed participant projections to map more closely to Polis with `utils.pca.sparsity_aware_project_ptpt()`.
@@ -38,6 +39,11 @@
 - Add group statement stats to MultiIndex DataFrame.
 - Add `reddwarf.data_presenter.print_repress()` for printing representative statements.
 - Add support for `Loader()` importing data from alternative Polis instances via `polis_instance_url` arg.
+- Patch sklearn with a simple `PatchedPipeline`, to allow pipeline steps to access other steps.
+- Modify `SparsityAwareScaler` to be able to use captured output from `SparsityAwareCapturer`.
+- Remove ported Polis PCA functions that are no longer used.
+- Remove old `impute_missing_votes()` function that's no longer used.
+- In `PolisClusteringResult`, add new `statements_df` and `participants_df` with all raw calculation values.
 
 ### Chores
 - Moved agora implementation from `reddwarf.agora` to `reddwarf.implementations.agora` (deprecation warning).
 
@@ -77,10 +77,6 @@ use in Scikit-Learn workflows, pipelines, and APIs.
     options:
         show_root_heading: true
 
-### ::: reddwarf.utils.impute_missing_votes
-    options:
-        show_root_heading: true
-
 ### ::: reddwarf.utils.get_unvoted_statement_ids
     options:
         show_root_heading: true
 
@@ -269,7 +269,7 @@
         "from reddwarf.utils.stats import select_representative_statements\n",
         "from reddwarf.data_presenter import print_repness\n",
         "\n",
-        "repness = select_representative_statements(grouped_stats_df=result.group_comment_stats)\n",
+        "repness = select_representative_statements(grouped_stats_df=result.group_comment_stats, mod_out_statement_ids=mod_out_statement_ids)\n",
         "print_repness(repness=repness, statements_data=statements)\n"
       ],
       "metadata": {},
 
@@ -1,14 +1,14 @@
 from typing import Optional
-from numpy.typing import NDArray
 from pandas import DataFrame
 from sklearn.decomposition import PCA
 from reddwarf.sklearn.cluster import PolisKMeans
 from reddwarf.utils.matrix import generate_raw_matrix, simple_filter_matrix, get_clusterable_participant_ids
 from reddwarf.utils.pca import run_pca
 from reddwarf.utils.clustering import find_optimal_k
 from dataclasses import dataclass
+import pandas as pd
 
-from reddwarf.utils.stats import calculate_comment_statistics_dataframes
+from reddwarf.utils.stats import calculate_comment_statistics_dataframes, populate_priority_calculations_into_statements_df
 
 @dataclass
 class PolisClusteringResult:
@@ -22,6 +22,8 @@ class PolisClusteringResult:
         kmeans (PolisKMeans): Scikit-Learn KMeans object for selected group count, including `labels_` and `cluster_centers_`. See `PolisKMeans`.
         group_aware_consensus (DataFrame): Group-aware consensus scores for each statement.
         group_comment_stats (DataFrame): A multi-index dataframes for each statement, indexed by group ID and statement.
+        statements_df (DataFrame): A dataframe with all intermediary and final statement data/calculations/metadata.
+        participants_df (DataFrame): A dataframe with all intermediary and final participant data/calculations/metadata.
     """
     raw_vote_matrix: DataFrame
     filtered_vote_matrix: DataFrame
@@ -31,10 +33,13 @@ class PolisClusteringResult:
     kmeans: PolisKMeans | None
     group_aware_consensus: DataFrame
     group_comment_stats: DataFrame
+    statements_df: DataFrame
+    participants_df: DataFrame
 
 def run_clustering(
     votes: list[dict],
     mod_out_statement_ids: list[int] = [],
+    meta_statement_ids: list[int] = [],
     min_user_vote_threshold: int = 7,
     keep_participant_ids: list[int] = [],
     init_centers: Optional[list[list[float]]] = None,
@@ -53,6 +58,7 @@ def run_clustering(
     Args:
         votes (list[dict]): Raw list of vote dicts, with keys for "participant_id", "statement_id", "vote" and "modified"
         mod_out_statement_ids (list[int]): List of statement IDs to moderate/zero out
+        meta_statement_ids (list[int]): List of meta statement IDs
         min_user_vote_threshold (int): Minimum number of votes a participant must make to be included in clustering
         keep_participant_ids (list[int]): List of participant IDs to keep in clustering algorithm, regardless of normal filters.
         max_group_count (): Max number of group (k-values) to test using k-means and silhouette scores
@@ -70,42 +76,69 @@ def run_clustering(
         mod_out_statement_ids=mod_out_statement_ids,
     )
 
-    projected_participants, projected_statements, pca = run_pca(vote_matrix=filtered_vote_matrix)
+    # Run PCA and generate participant/statement projections.
+    # DataFrames each have "x" and "y" columns.
+    participants_df, statements_df, pca = run_pca(vote_matrix=filtered_vote_matrix)
 
-    participant_ids_clusterable = get_clusterable_participant_ids(raw_vote_matrix, vote_threshold=min_user_vote_threshold)
+    participant_ids_to_cluster = get_clusterable_participant_ids(raw_vote_matrix, vote_threshold=min_user_vote_threshold)
     if keep_participant_ids:
-        participant_ids_clusterable = list(set(participant_ids_clusterable + keep_participant_ids))
+        # TODO: Make this an intersection, in case there are members of
+        # keep_participant_ids list that aren't represented in vote_matrix.
+        participant_ids_to_cluster = sorted(list(set(participant_ids_to_cluster + keep_participant_ids)))
 
     if force_group_count:
         k_bounds = [force_group_count, force_group_count]
     else:
         k_bounds = [2, max_group_count]
 
-    projected_participants_clusterable = projected_participants.loc[participant_ids_clusterable, :]
     _, _, kmeans = find_optimal_k(
-        projected_data=projected_participants_clusterable,
+        projected_data=participants_df.loc[participant_ids_to_cluster, :],
         k_bounds=k_bounds,
         # Force polis strategy of initiating cluster centers. See: PolisKMeans.
         init="polis",
         init_centers=init_centers,
         random_state=random_state,
     )
-    projected_participants_clusterable = projected_participants_clusterable.assign(
-        cluster_id=kmeans.labels_ if kmeans else None,
+    label_series = pd.Series(
+        kmeans.labels_ if kmeans else None,
+        index=participant_ids_to_cluster,
+        dtype="Int64", # Allows nullable/NaN values.
     )
+    participants_df["to_cluster"] = participants_df.index.isin(participant_ids_to_cluster)
+    participants_df["cluster_id"] = label_series
 
     grouped_stats_df, gac_df = calculate_comment_statistics_dataframes(
-        vote_matrix=raw_vote_matrix.loc[participant_ids_clusterable, :],
+        vote_matrix=raw_vote_matrix.loc[participant_ids_to_cluster, :],
         cluster_labels=kmeans.labels_,
     )
 
+    def get_with_default(lst, idx, default=None):
+        try:
+            return lst[idx]
+        except IndexError:
+            return default
+
+    statements_df["to_zero"] = statements_df.index.isin(mod_out_statement_ids)
+    statements_df["is_meta"] = statements_df.index.isin(meta_statement_ids)
+    statements_df["mean"] = pca.mean_
+    statements_df["pc1"] = get_with_default(pca.components_, 0)
+    statements_df["pc2"] = get_with_default(pca.components_, 1)
+    statements_df["pc3"] = get_with_default(pca.components_, 2)
+    statements_df = pd.concat([statements_df, gac_df], axis=1)
+    statements_df = populate_priority_calculations_into_statements_df(
+        statements_df=statements_df,
+        vote_matrix=raw_vote_matrix.loc[participant_ids_to_cluster, :],
+    )
+
     return PolisClusteringResult(
         raw_vote_matrix=raw_vote_matrix,
         filtered_vote_matrix=filtered_vote_matrix,
         pca=pca,
-        projected_participants=projected_participants_clusterable,
-        projected_statements=projected_statements,
+        projected_participants=participants_df.loc[participant_ids_to_cluster, ["x", "y", "cluster_id"]], # deprecate?
+        projected_statements=statements_df.loc[:, ["x", "y"]], # deprecate?
         kmeans=kmeans,
-        group_aware_consensus=gac_df,
+        group_aware_consensus=gac_df, # deprecate?
         group_comment_stats=grouped_stats_df,
+        statements_df=statements_df,
+        participants_df=participants_df,
     )
@@ -0,0 +1,35 @@
+from sklearn.pipeline import Pipeline
+
+
+class PatchedPipeline(Pipeline):
+    """
+    A subclass of sklearn's Pipeline that injects a `_parent_pipeline` attribute into each step.
+
+    This allows individual transformers in the pipeline to access their parent pipeline and,
+    by extension, other steps within it. Useful for custom transformers that depend on
+    intermediate results from earlier steps (e.g., SparsityAwareScaler using SparsityAwareCapturer output).
+
+    Example:
+    ```
+    pipeline = PatchedPipeline([
+        ("capture", SparsityAwareCapturer()),
+        ("scale", SparsityAwareScaler(capture_step="capture")),
+    ])
+
+    # Inside SparsityAwareScaler.transform():
+    # capture_step = self._parent_pipeline.named_steps["capture"]
+    # X_sparse = capture_step.X_captured_
+    ```
+
+    Note:
+        - Steps must support attribute assignment (`__dict__`) to receive the reference.
+        - `_parent_pipeline` is injected once during initialization.
+    """
+    def __init__(self, steps, **kwargs):
+        super().__init__(steps, **kwargs)
+        self._patch_steps()
+
+    def _patch_steps(self):
+        for _, step in self.steps:
+            if hasattr(step, '__dict__'):
+                step._parent_pipeline = self
@@ -7,6 +7,11 @@
 Array2D = NDArray[np.float64]
 
 def calculate_scaling_factors(X_sparse: Array1D | Array2D) -> Array1D:
+    """
+    Calculate row-based scaling factors from the sparse vote matrix.
+
+    (Outside estimator so available for re-use.)
+    """
     # This allows function to work for 2D (full vote_matrix) and 1D (participant_votes).
     # It essentially nest an 1D matrix in a 2D one.
     X_sparse = np.atleast_2d(X_sparse)
@@ -29,14 +34,16 @@ def calculate_scaling_factors(X_sparse: Array1D | Array2D) -> Array1D:
 
 class SparsityAwareScaler(BaseEstimator, TransformerMixin):
     """
-    Scale projected points (participant/statements) based on sparsity of vote
+    Scale projected points (participant or statements) based on sparsity of vote
     matrix, to account for any small number of votes by a participant and
     prevent those participants from bunching up in the center.
 
     Attributes:
+        capture_step (str | int | None): Name or index of the capture step in the pipeline.
         X_sparse (np.ndarray | None): A sparse array with shape (n_features,)
     """
-    def __init__(self, X_sparse: Optional[Array1D | Array2D] = None):
+    def __init__(self, capture_step: Optional[str | int] = None, X_sparse: Optional[Array1D | Array2D] = None):
+        self.capture_step = capture_step
         self.X_sparse = X_sparse
 
     # See: https://scikit-learn.org/stable/modules/generated/sklearn.utils.Tags.html#sklearn.utils.Tags
@@ -57,10 +64,53 @@ def inverse_transform(self, X):
         scaling_factors = self._calculate_scaling_factors()
         return X / scaling_factors[:, np.newaxis]
 
-    def _calculate_scaling_factors(self):
-        if self.X_sparse is None:
+
+    def _get_pipeline_step(self, step):
+        """
+        Fetch a step (by name or index) from the parent pipeline injected by PatchedPipeline.
+        """
+        parent = getattr(self, "_parent_pipeline", None)
+        if parent is None:
+            raise RuntimeError(
+                f"{self.__class__.__name__} cannot resolve `capture_step={step}` "
+                "because it is not being used inside a `PatchedPipeline`. "
+                "Either use a `PatchedPipeline` or pass `X_sparse` directly."
+            )
+        if isinstance(step, str):
+            return parent.named_steps[step]
+        elif isinstance(step, int):
+            return parent.steps[step][1]
+        else:
+            raise ValueError("`capture_step` must be a string (name) or int (index).")
+
+    def _resolve_X_sparse(self):
+        """
+        Resolve X_sparse (a sparse vote matrix) from argument or prior capture step.
+        """
+        if self.X_sparse is not None:
+            return self.X_sparse
+
+        capture = self._get_pipeline_step(self.capture_step)
+        if not hasattr(capture, "X_captured_"):
             raise AttributeError(
-                "Missing `X_sparse`. Pass `X_sparse` when initializing SparsityAwareScaler."
+                f"Step '{self.capture_step}' does not contain `.X_captured_`. "
+                f"Did you run `fit/transform` on the pipeline?"
             )
+        return capture.X_captured_
 
-        return calculate_scaling_factors(X_sparse=self.X_sparse)
+    def _calculate_scaling_factors(self):
+        X_sparse = self._resolve_X_sparse()
+        return calculate_scaling_factors(X_sparse=X_sparse)
+
+class SparsityAwareCapturer(BaseEstimator, TransformerMixin):
+    """
+    A passthrough transformer that captures and stores the X it receives in
+    `self.X_captured_`. Useful in pipelines where a later step needs access to
+    this intermediate result.
+    """
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        self.X_captured_ = X  # Store the actual input value
+        return X
@@ -7,34 +7,6 @@
 
 VoteMatrix: TypeAlias = pd.DataFrame
 
-def impute_missing_votes(vote_matrix: VoteMatrix) -> VoteMatrix:
-    """
-    Imputes missing votes in a voting matrix using column-wise mean. All columns must have at least one vote.
-
-    Reference:
-        Small, C. (2021). "Polis: Scaling Deliberation by Mapping High Dimensional Opinion Spaces."
-        Specific highlight: <https://hyp.is/8zUyWM5fEe-uIO-J34vbkg/gwern.net/doc/sociology/2021-small.pdf>
-
-    Args:
-        vote_matrix (pd.DataFrame):  A vote matrix DataFrame with `NaN`/`None` values where: \
-                                        1. rows are voters, \
-                                        2. columns are statements, and \
-                                        3. values are votes.
-
-    Returns:
-        imputed_matrix (pd.DataFrame): The same vote matrix DataFrame imputing missing values with column mean.
-    """
-    if vote_matrix.isna().all(axis="rows").any():
-        raise RedDwarfError("impute_missing_votes does not support vote matrices containing statement columns with no votes.")
-
-    mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
-    imputed_matrix = pd.DataFrame(
-        mean_imputer.fit_transform(vote_matrix),
-        columns=vote_matrix.columns,
-        index=vote_matrix.index,
-    )
-    return imputed_matrix
-
 def filter_votes(
         votes: List[Dict],
         cutoff: Optional[int] = None,
@@ -232,4 +204,21 @@ def filter_matrix(
     elif unvoted_filter_type == 'zero':
         vote_matrix[unvoted_statement_ids] = 0
 
-    return vote_matrix
+    return vote_matrix
+
+def generate_virtual_vote_matrix(n_statements: int):
+    """
+    Creates a matrix of virtual participants, each of whom votes agree on a
+    single statement, with no other votes. (This is a variation of an "identity
+    matrix", with votes going across the diagonal of a full NaN matrix.)
+    """
+    # Build a basic identity matrix
+    virtual_vote_matrix = np.eye(n_statements)
+
+    # Replace 1s with +1 and 0s with NaN
+    # TODO: Why does Polis use -1 (disagree) here? is it the same? BUG?
+    AGREE_VAL = 1
+    MISSING_VAL = np.nan
+    virtual_vote_matrix = np.where(virtual_vote_matrix == 1, AGREE_VAL, MISSING_VAL)
+
+    return virtual_vote_matrix