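Generate 10 principal components for PCA: name columns comp1..compN, plot only the first two components, and save all components in the projections JSON.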

patcon · patcon · commit 018964eab328 · 2025-11-24T01:13:28.000-05:00
diff --git a/conf/base/parameters_experimental.yml b/conf/base/parameters_experimental.yml
@@ -28,7 +28,7 @@ pipelines:
 
     reducer:
       name: PCA
-      n_components: 2
+      n_components: 10
 
     scaler:
       name: SparsityAwareScaler
@@ -198,7 +198,7 @@ pipelines:
   #   <<: *knn5d_pacmap_besthdbscanflat
   #   reducer:
   #     name: PCA
-  #     n_components: 2
+  #     n_components: 10
   #     random_state: ${globals:random_state}
 
   knn5d_pacmap_bestkmeans: &knn5d_pacmap_bestkmeans
@@ -236,7 +236,7 @@ pipelines:
     <<: *knn5d_pacmap_bestkmeans
     reducer:
       name: PCA
-      n_components: 2
+      n_components: 10
       random_state: ${globals:random_state}
 
   mean_umap_bestkmeans:
diff --git a/src/kedro_polis_classic/pipelines/experimental/nodes.py b/src/kedro_polis_classic/pipelines/experimental/nodes.py
@@ -321,9 +321,9 @@ def _create_scatter_plot(
         # Update axis labels
         fig.update_layout(
             scene=dict(
-                xaxis_title=f"{str(x_col).upper()} Component",
-                yaxis_title=f"{str(y_col).upper()} Component",
-                zaxis_title=f"{str(z_col).upper()} Component",
+                xaxis_title=f"{str(x_col).upper()}",
+                yaxis_title=f"{str(y_col).upper()}",
+                zaxis_title=f"{str(z_col).upper()}",
             ),
             width=800,
             height=600,
@@ -360,8 +360,8 @@ def _create_scatter_plot(
 
         # Update axis labels and layout
         fig.update_layout(
-            xaxis_title=f"{str(x_col).upper()} Component",
-            yaxis_title=f"{str(y_col).upper()} Component",
+            xaxis_title=f"{str(x_col).upper()}",
+            yaxis_title=f"{str(y_col).upper()}",
             width=800,
             height=600,
             plot_bgcolor="white",
@@ -408,10 +408,7 @@ def create_scatter_plot(
     if isinstance(filter_output, np.ndarray):
         # Create generic column names based on dimensions
         n_components = filter_output.shape[1] if len(filter_output.shape) > 1 else 1
-        if n_components <= 3:
-            column_names = ["x", "y", "z"][:n_components]
-        else:
-            column_names = [f"PC{i + 1}" for i in range(n_components)]
+        column_names = [f"comp{i + 1}" for i in range(n_components)]
 
         # Create DataFrame with actual participant IDs as index
         data = pd.DataFrame(
@@ -424,6 +421,10 @@ def create_scatter_plot(
         data = filter_output.copy()
         data.index = included_participant_ids
 
+    # For plotting, only use the first 2 components (even if more are available)
+    if len(data.columns) > 2:
+        data = data.iloc[:, :2]
+
     # Convert cluster labels to pandas Series of strings for categorical coloring
     # Make sure the cluster labels have the same index as the data DataFrame
     if isinstance(clusterer_output, np.ndarray):
@@ -488,10 +489,7 @@ def create_scatter_plot_by_participant_id(
     if isinstance(filter_output, np.ndarray):
         # Create generic column names based on dimensions
         n_components = filter_output.shape[1] if len(filter_output.shape) > 1 else 1
-        if n_components <= 3:
-            column_names = ["x", "y", "z"][:n_components]
-        else:
-            column_names = [f"PC{i + 1}" for i in range(n_components)]
+        column_names = [f"comp{i + 1}" for i in range(n_components)]
 
         # Create DataFrame with actual participant IDs as index
         data = pd.DataFrame(
@@ -504,6 +502,10 @@ def create_scatter_plot_by_participant_id(
         data = filter_output.copy()
         data.index = included_participant_ids
 
+    # For plotting, only use the first 2 components (even if more are available)
+    if len(data.columns) > 2:
+        data = data.iloc[:, :2]
+
     # Get participant IDs as numeric values for continuous color scale
     participant_ids = pd.Series(data.index, index=data.index)
 
@@ -552,10 +554,7 @@ def create_scatter_plot_by_vote_proportions(
     if isinstance(filter_output, np.ndarray):
         # Create generic column names based on dimensions
         n_components = filter_output.shape[1] if len(filter_output.shape) > 1 else 1
-        if n_components <= 3:
-            column_names = ["x", "y", "z"][:n_components]
-        else:
-            column_names = [f"PC{i + 1}" for i in range(n_components)]
+        column_names = [f"comp{i + 1}" for i in range(n_components)]
 
         # Create DataFrame with actual participant IDs as index
         data = pd.DataFrame(
@@ -568,6 +567,10 @@ def create_scatter_plot_by_vote_proportions(
         data = filter_output.copy()
         data.index = included_participant_ids
 
+    # For plotting, only use the first 2 components (even if more are available)
+    if len(data.columns) > 2:
+        data = data.iloc[:, :2]
+
     # Calculate total number of votes cast by each included participant
     # Vote values: 1 = agree, -1 = disagree, 0 = pass, NaN = no vote
     # Count all non-NaN values (any vote cast) for the included participants only
@@ -689,11 +692,8 @@ def save_projections_json(
         # If it's a DataFrame, get the values
         X_clustered = filter_output.values
 
-    # Ensure we have 2D coordinates (take first 2 dimensions if more)
-    if X_clustered.shape[1] > 2:
-        X_clustered = X_clustered[:, :2]
-
-    # Create the format: [[participant_id, [x, y]], ...]
+    # Save all components to disk (don't truncate to 2D)
+    # Create the format: [[participant_id, [comp1, comp2, comp3, ...]], ...]
     X_with_ids = []
     for i, participant_id in enumerate(included_participant_ids):
         coords = X_clustered[i].tolist()
diff --git a/src/kedro_polis_classic/pipelines/polis_legacy/nodes.py b/src/kedro_polis_classic/pipelines/polis_legacy/nodes.py
@@ -145,9 +145,9 @@ def _create_scatter_plot(
         fig.update_layout(
             title=title,
             scene=dict(
-                xaxis_title=f"{x_col.upper()} Component",
-                yaxis_title=f"{y_col.upper()} Component",
-                zaxis_title=f"{z_col.upper()} Component",
+                xaxis_title=f"{str(x_col).upper()}",
+                yaxis_title=f"{str(y_col).upper()}",
+                zaxis_title=f"{str(z_col).upper()}",
             ),
             width=800,
             height=600,
@@ -174,8 +174,8 @@ def _create_scatter_plot(
 
         fig.update_layout(
             title=title,
-            xaxis_title=f"{x_col.upper()} Component",
-            yaxis_title=f"{y_col.upper()} Component",
+            xaxis_title=f"{str(x_col).upper()}",
+            yaxis_title=f"{str(y_col).upper()}",
             width=800,
             height=600,
             plot_bgcolor="white",
@@ -289,11 +289,7 @@ def reduce_with_pca(
     components = pca.fit_transform(imputed_vote_matrix)
 
     # Create column names based on number of components
-    DIMENSION_COLS = ["x", "y", "z"]
-    if n_components <= 3:
-        column_names = DIMENSION_COLS[:n_components]
-    else:
-        column_names = [f"PC{i + 1}" for i in range(n_components)]
+    column_names = [f"comp{i + 1}" for i in range(n_components)]
 
     return pd.DataFrame(
         components, index=imputed_vote_matrix.index, columns=pd.Index(column_names)