Merge pull request #117 from polis-community/116-fix-bestkmeans

patcon · web-flow · commit e73a66579a84 · 2026-02-04T19:32:59.000-05:00
Improve error message for invalid init strategy
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -62,3 +62,7 @@ Tests use real Polis API data fixtures in `tests/fixtures/`. The test suite incl
 - `plots`: matplotlib, seaborn, concave-hull (visualization)
 - `dev`: pytest, mkdocs, nbmake (development)
 - `all`: everything
+
+## Git Conventions
+
+- When working on a branch that references an issue (e.g., `116-fix-bestkmeans`), include `Closes #116` in the commit message or PR description so that GitHub auto-closes the issue when the change is merged into the default branch.
diff --git a/reddwarf/sklearn/cluster.py b/reddwarf/sklearn/cluster.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Literal, Optional
 
 import numpy as np
 from numpy.typing import ArrayLike, NDArray
@@ -9,6 +9,9 @@
 
 from reddwarf.sklearn.model_selection import GridSearchNonCV
 
+InitStrategy = Literal["k-means++", "random", "polis"]
+VALID_INIT_STRATEGIES: List[str] = ["k-means++", "random", "polis"]
+
 
 def _to_range(r) -> range:
     """
@@ -76,8 +79,8 @@ class PolisKMeans(KMeans):
     def __init__(
         self,
         n_clusters=8,
-        init="k-means++",  # or 'random', 'polis'
-        init_centers: Optional[ArrayLike] = None,  # array-like, optional
+        init: InitStrategy = "k-means++",
+        init_centers: Optional[ArrayLike] = None,
         n_init="auto",
         max_iter=300,
         tol=1e-4,
@@ -120,7 +123,10 @@ def _generate_centers(self, X, x_squared_norms, n_to_generate, random_state) ->
                 raise ValueError("Not enough unique rows in X for 'polis' strategy.")
             centers = unique_X[:n_to_generate]
         else:
-            raise ValueError(f"Unsupported init strategy: {self._init_strategy}")
+            raise ValueError(
+                f"Unsupported init strategy: {self._init_strategy!r}. "
+                f"Valid options are: {VALID_INIT_STRATEGIES}"
+            )
         return centers
 
     def fit(self, X, y=None, sample_weight=None):
@@ -178,7 +184,7 @@ def __init__(
         self,
         n_clusters: int = 100,
         random_state: Optional[int] = None,
-        init: str = "k-means++",
+        init: InitStrategy = "k-means++",
         init_centers: Optional[ArrayLike] = None,
     ):
         self.n_clusters = n_clusters
@@ -232,7 +238,7 @@ class BestPolisKMeans(BaseEstimator):
     def __init__(
         self,
         k_bounds: Optional[List[int]] = None,
-        init: str = "polis",
+        init: InitStrategy = "polis",
         init_centers: Optional[ArrayLike] = None,
         random_state: Optional[int] = None,
     ):
diff --git a/reddwarf/utils/clusterer/kmeans.py b/reddwarf/utils/clusterer/kmeans.py
@@ -2,7 +2,7 @@
 import pandas as pd
 import numpy as np
 from reddwarf.sklearn.model_selection import GridSearchNonCV
-from reddwarf.sklearn.cluster import PolisKMeans
+from reddwarf.sklearn.cluster import InitStrategy, PolisKMeans
 from sklearn.metrics import silhouette_score
 from typing import List, Optional
 
@@ -33,7 +33,7 @@ def to_range(r: RangeLike) -> range:
 def run_kmeans(
         dataframe: pd.DataFrame,
         n_clusters: int = 2,
-        init="k-means++",
+        init: InitStrategy = "k-means++",
         # TODO: Improve this type. 3d?
         init_centers: Optional[List] = None,
         random_state: Optional[int] = None,
@@ -66,7 +66,7 @@ def run_kmeans(
 def find_best_kmeans(
         X_to_cluster: NDArray,
         k_bounds: RangeLike = [2, 5],
-        init="k-means++",
+        init: InitStrategy = "k-means++",
         init_centers: Optional[List] = None,
         random_state: Optional[int] = None,
 ) -> tuple[int, float, PolisKMeans | None]:
diff --git a/tests/sklearn/test_cluster.py b/tests/sklearn/test_cluster.py
@@ -184,12 +184,12 @@ def test_init_centers_wrong_n_features(self, simple_data):
             pkm.fit(X)
 
     def test_unsupported_init_strategy(self, simple_data):
-        """Test that unsupported init strategy raises error."""
+        """Test that unsupported init strategy raises error with valid options."""
         X, _ = simple_data
-        pkm = PolisKMeans(n_clusters=3, init="invalid")
+        pkm = PolisKMeans(n_clusters=3, init="k-means++")
         pkm._init_strategy = "invalid"  # Bypass __init__ validation
 
-        with pytest.raises(ValueError, match="Unsupported init strategy"):
+        with pytest.raises(ValueError, match=r"Unsupported init strategy.*k-means\+\+.*random.*polis"):
             pkm.fit(X)
 
     def test_reproducibility_with_random_state(self, simple_data):