Skip to content

Commit 524ab69

Browse files
committed
Remove functions for old ported PCA code, and unused imputation.
1 parent ee6ca7a commit 524ab69

7 files changed

Lines changed: 6 additions & 207 deletions

File tree

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@
4141
- Add support for `Loader()` importing data from alternative Polis instances via `polis_instance_url` arg.
4242
- Patch sklearn with a simple `PatchedPipeline`, to allow pipeline steps to access other steps.
4343
- Modify `SparsityAwareScaler` to be able to use captured output from `SparsityAwareCapturer`.
44+
- Remove ported Polis PCA functions that are no longer used.
45+
- Remove old `impute_missing_votes()` function that's no longer used.
4446

4547
### Chores
4648
- Moved agora implementation from `reddwarf.agora` to `reddwarf.implementations.agora` (deprecation warning).

docs/api_reference.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -77,10 +77,6 @@ use in Scikit-Learn workflows, pipelines, and APIs.
7777
options:
7878
show_root_heading: true
7979

80-
### ::: reddwarf.utils.impute_missing_votes
81-
options:
82-
show_root_heading: true
83-
8480
### ::: reddwarf.utils.get_unvoted_statement_ids
8581
options:
8682
show_root_heading: true

reddwarf/utils/matrix.py

Lines changed: 0 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -7,34 +7,6 @@
77

88
VoteMatrix: TypeAlias = pd.DataFrame
99

10-
def impute_missing_votes(vote_matrix: VoteMatrix) -> VoteMatrix:
11-
"""
12-
Imputes missing votes in a voting matrix using column-wise mean. All columns must have at least one vote.
13-
14-
Reference:
15-
Small, C. (2021). "Polis: Scaling Deliberation by Mapping High Dimensional Opinion Spaces."
16-
Specific highlight: <https://hyp.is/8zUyWM5fEe-uIO-J34vbkg/gwern.net/doc/sociology/2021-small.pdf>
17-
18-
Args:
19-
vote_matrix (pd.DataFrame): A vote matrix DataFrame with `NaN`/`None` values where: \
20-
1. rows are voters, \
21-
2. columns are statements, and \
22-
3. values are votes.
23-
24-
Returns:
25-
imputed_matrix (pd.DataFrame): The same vote matrix DataFrame imputing missing values with column mean.
26-
"""
27-
if vote_matrix.isna().all(axis="rows").any():
28-
raise RedDwarfError("impute_missing_votes does not support vote matrices containing statement columns with no votes.")
29-
30-
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
31-
imputed_matrix = pd.DataFrame(
32-
mean_imputer.fit_transform(vote_matrix),
33-
columns=vote_matrix.columns,
34-
index=vote_matrix.index,
35-
)
36-
return imputed_matrix
37-
3810
def filter_votes(
3911
votes: List[Dict],
4012
cutoff: Optional[int] = None,

reddwarf/utils/pca.py

Lines changed: 2 additions & 94 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import pandas as pd
33
import numpy as np
44
from reddwarf.utils.matrix import VoteMatrix, generate_virtual_vote_matrix
5-
from reddwarf.sklearn.transformers import SparsityAwareCapturer, SparsityAwareScaler, calculate_scaling_factors
5+
from reddwarf.sklearn.transformers import SparsityAwareCapturer, SparsityAwareScaler
66
from reddwarf.sklearn.pipeline import PatchedPipeline
77
from typing import Tuple
88

@@ -67,99 +67,7 @@ def run_pca(
6767

6868
return projected_participants, projected_statements, pca
6969

70-
# TODO: Clean up variables and docs.
71-
def sparsity_aware_project_ptpt(participant_votes, statement_components, statement_means):
72-
"""
73-
Projects a sparse vote vector into PCA space while adjusting for sparsity.
74-
75-
Args:
76-
participant_votes (list): List of participant votes on each statement
77-
statement_components (list[list[float]]): Two lists of floats corresponding to the two principal components
78-
statement_means (list[float]): List of floats corresponding to the centers/means of each statement
79-
80-
Returns:
81-
projected_coords (list[list[float]]): Two lists corresponding to projected xy coordinates.
82-
"""
83-
statement_components = np.array(statement_components) # Shape: (2, n_features)
84-
statement_means = np.array(statement_means) # Shape: (n_features,)
85-
86-
participant_votes = np.array(participant_votes)
87-
mask = ~np.isnan(participant_votes) # Only consider non-null values
88-
89-
# Extract relevant values
90-
x_vals = participant_votes[mask] - statement_means[mask] # Centered values
91-
# TODO: Extend this to work in 3D
92-
pc1_vals, pc2_vals = statement_components[:, mask] # Select only used components
93-
94-
# Compute dot product projection
95-
p1 = np.dot(x_vals, pc1_vals)
96-
p2 = np.dot(x_vals, pc2_vals)
97-
98-
scaling_factor = calculate_scaling_factors(participant_votes)
99-
100-
coord_projected = np.array([p1, p2])
101-
coord_scaled = coord_projected * scaling_factor
102-
103-
return coord_scaled
104-
105-
# TODO: Clean up variables and docs.
106-
def sparsity_aware_project_ptpts(vote_matrix, statement_components, statement_means):
107-
"""
108-
Apply sparsity-aware projection to multiple vote vectors.
109-
"""
110-
return np.array([
111-
sparsity_aware_project_ptpt(participant_votes, statement_components, statement_means)
112-
for participant_votes in vote_matrix]
113-
)
114-
115-
# TODO: Clean up variables and docs.
116-
def pca_project_cmnts(statement_components, statement_means):
117-
"""
118-
Projects unit vectors for each feature into PCA space to understand their placement.
119-
"""
120-
121-
# Create a matrix of virtual participants that each vote once on a single statement.
122-
123-
# Build a basic identity matrix
124-
n_statements = len(statement_means)
125-
virtual_vote_matrix = np.eye(n_statements)
126-
127-
# Replace 1s with -1 and 0s with NaN
128-
# TODO: Why does Polis use -1 (disagree) here? is it the same? BUG?
129-
AGREE_VAL = 1
130-
MISSING_VAL = np.nan
131-
virtual_vote_matrix = np.where(virtual_vote_matrix == 1, AGREE_VAL, MISSING_VAL)
132-
133-
statement_projections = sparsity_aware_project_ptpts(
134-
virtual_vote_matrix,
135-
statement_components,
136-
statement_means,
137-
)
138-
139-
return statement_projections
140-
14170
def calculate_extremity(projections: ArrayLike):
14271
# Compute extremity as vector magnitude on rows.
14372
# vector magnitude = Euclidean norm = hypotenuse of xy
144-
return np.linalg.norm(projections, axis=0)
145-
146-
# TODO: Clean up variables and docs.
147-
def with_proj_and_extremity(pca):
148-
"""
149-
Compute projection and extremity, then merge into PCA results.
150-
"""
151-
statement_projections = pca_project_cmnts(
152-
statement_components=pca["comps"],
153-
statement_means=pca["center"],
154-
)
155-
156-
# Flip the axes to get all x together and y together.
157-
# 2 sets of 40. shape (2, 40)
158-
statement_projections = statement_projections.transpose()
159-
160-
statement_extremities = calculate_extremity(statement_projections)
161-
162-
pca["comment-projection"] = statement_projections.tolist()
163-
pca["comment-extremity"] = statement_extremities.tolist()
164-
165-
return pca
73+
return np.linalg.norm(projections, axis=0)

tests/implementations/test_polis_legacy.py

Lines changed: 0 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -82,21 +82,6 @@ def test_statement_count(polis_convo_data):
8282

8383
assert client.statement_count == expected_data
8484

85-
@pytest.mark.parametrize("polis_convo_data", ["small-no-meta", "small-with-meta", "medium-with-meta", "medium-no-meta"], indirect=True)
86-
def test_impute_missing_values(polis_convo_data):
87-
fixture = polis_convo_data
88-
client = PolisClient(is_strict_moderation=False)
89-
client.load_data(filepaths=[
90-
f'{fixture.data_dir}/votes.json',
91-
f'{fixture.data_dir}/comments.json',
92-
])
93-
matrix_with_missing = client.get_matrix(is_filtered=True)
94-
matrix_without_missing = utils.impute_missing_votes(matrix_with_missing)
95-
96-
assert matrix_with_missing.isnull().values.sum() > 0
97-
assert matrix_without_missing.isnull().values.sum() == 0
98-
assert matrix_with_missing.shape == matrix_without_missing.shape
99-
10085
# This test can't be parametrized without changes.
10186
@pytest.mark.parametrize("polis_convo_data", ["small"], indirect=True)
10287
def test_filtered_participants_grouped(polis_convo_data):

tests/test_utils.py

Lines changed: 1 addition & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -267,47 +267,4 @@ def test_filter_matrix_mod_out_statement_with_zero(make_vote_matrix):
267267
)
268268
assert statement_count(initial_matrix) == statement_count(filtered_matrix)
269269

270-
assert (filtered_matrix[1] == 0).all()
271-
272-
def test_impute_missing_votes():
273-
# Manually calculated
274-
expected_matrix = pd.DataFrame(
275-
[
276-
[ 0, 1, 0, 1/3],
277-
[-1, 0, 0, -1],
278-
[-1, 0.5, 0, 1],
279-
[-1, 0.5, 0, 1],
280-
],
281-
columns=[0, 1, 2, 3], # statement_ids
282-
index=[0, 1, 2, 3], # participant_ids
283-
dtype=float,
284-
)
285-
286-
initial_matrix = pd.DataFrame(
287-
[
288-
[ 0, 1, None, None],
289-
[-1, 0, None, -1],
290-
[-1, None, None, 1],
291-
[-1, None, 0, 1],
292-
],
293-
columns=[0, 1, 2, 3], # statement_ids
294-
index=[0, 1, 2, 3], # participant_ids
295-
dtype=float,
296-
)
297-
imputed_matrix = utils.impute_missing_votes(vote_matrix=initial_matrix)
298-
assert_frame_equal(imputed_matrix, expected_matrix)
299-
300-
def test_impute_missing_votes_no_vote_statement_error():
301-
initial_matrix = pd.DataFrame(
302-
[
303-
[ 0, 1, None, None],
304-
[-1, 0, None, -1],
305-
[-1, None, None, 1],
306-
[-1, None, None, 1],
307-
],
308-
columns=[0, 1, 2, 3], # statement_ids
309-
index=[0, 1, 2, 3], # participant_ids
310-
dtype=float,
311-
)
312-
with pytest.raises(RedDwarfError):
313-
utils.impute_missing_votes(vote_matrix=initial_matrix)
270+
assert (filtered_matrix[1] == 0).all()

tests/utils/test_pca.py

Lines changed: 1 addition & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -144,10 +144,6 @@ def test_run_pca_real_data_below_100(polis_convo_data):
144144
pid = expected["participant_id"]
145145
assert_array_almost_equal(actual_projected_participants.loc[pid, ["x","y"]].values, expected["xy"])
146146

147-
# print(actual_projected_statements.values.transpose())
148-
# print(expected_pca["comment-projection"])
149-
# print(PcaUtils.pca_project_cmnts(actual_pca.components_, actual_pca.mean_).transpose())
150-
# print(PcaUtils.pca_project_cmnts(expected_pca["comps"], np.asarray(expected_pca["center"])).transpose())
151147
assert_array_almost_equal(actual_projected_statements.values.transpose(), expected_pca["comment-projection"])
152148

153149
# TODO: Accommodate sign flips in "medium-no-meta".
@@ -212,21 +208,4 @@ def test_run_pca_real_data_testing():
212208
assert actual_pca.components_[0] == pytest.approx(expected_pca["comps"][0])
213209
assert actual_pca.components_[1] == pytest.approx(expected_pca["comps"][1])
214210

215-
assert actual_pca.mean_ == pytest.approx(expected_pca["center"])
216-
217-
@pytest.mark.parametrize("polis_convo_data", ["small-no-meta", "small-with-meta", "medium-with-meta"], indirect=True)
218-
def test_with_proj_and_extremity(polis_convo_data):
219-
fixture = polis_convo_data
220-
# Invert to correct for flipped signs in polismath.
221-
math_data = helpers.flip_signs_by_key(nested_dict=fixture.math_data, keys=["pca.center", "pca.comment-projection", "base-clusters.x", "base-clusters.y", "group-clusters[*].center"])
222-
expected_pca = math_data["pca"]
223-
224-
minimal_pca = {
225-
"center": expected_pca["center"],
226-
"comps": expected_pca["comps"],
227-
}
228-
229-
calculated_pca = PcaUtils.with_proj_and_extremity(pca=minimal_pca)
230-
231-
assert expected_pca["comment-projection"] == calculated_pca["comment-projection"]
232-
assert expected_pca["comment-extremity"] == calculated_pca["comment-extremity"]
211+
assert actual_pca.mean_ == pytest.approx(expected_pca["center"])

0 commit comments

Comments
 (0)