-
Notifications
You must be signed in to change notification settings - Fork 61
first draft of silhouette_samples that works (according to my test) #2199
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,213 @@ | ||
| import heat as ht | ||
| import numpy as np | ||
|
|
||
|
|
||
| # Checks | ||
def check_number_of_labels(n_labels, n_samples):
    """Validate that the number of distinct labels is usable.

    Silhouette values are only defined when ``2 <= n_labels <= n_samples - 1``.

    Parameters
    ----------
    n_labels : int
        Number of labels.
    n_samples : int
        Number of samples.

    Raises
    ------
    ValueError
        If ``n_labels`` lies outside the valid range.
    """
    if n_labels <= 1 or n_labels >= n_samples:
        raise ValueError(
            "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)" % n_labels
        )
|
|
||
|
|
||
def check_X_y(X, y, accept_sparse=False):
    """Input validation for standard estimators.

    Parameters
    ----------
    X : {DNDarray, list, sparse matrix}
        Input data.
    y : {DNDarray, list, sparse matrix}
        Labels.
    accept_sparse : bool or list, optional
        Forwarded to ``check_array``; whether sparse input is allowed.

    Returns
    -------
    X_converted : object
        The converted and validated X.
    y_converted : object
        The converted and validated y.
    """
    X_checked = check_array(X, accept_sparse=accept_sparse, input_name="X")
    y_checked = _check_y(y)

    # Both inputs must describe the same number of samples and be
    # partitioned identically.
    check_consistent_length(X_checked, y_checked)

    return X_checked, y_checked
|
|
||
|
|
||
def check_array(X, accept_sparse=False, input_name="X"):
    """Convert input to a DNDarray and reject sparse data unless allowed.

    Parameters
    ----------
    X : array-like
        Input data; converted to a row-split ``ht.DNDarray`` when necessary.
    accept_sparse : bool or list, optional
        Whether sparse input is acceptable.
    input_name : str, optional
        Name used in the error message.

    Returns
    -------
    DNDarray
        The validated input.
    """
    converted = X if isinstance(X, ht.DNDarray) else ht.array(X, split=0)

    # NOTE(review): assumes DNDarray exposes an ``is_sparse`` attribute —
    # confirm against the heat version in use.
    if converted.is_sparse and not accept_sparse:
        raise TypeError(f"{input_name} is sparse, but sparse input is not accepted.")

    return converted
|
|
||
|
|
||
def _check_y(y):
    """Validate a label vector and convert it to a 1-D DNDarray.

    Parameters
    ----------
    y : array-like
        Labels; an ``(n, 1)`` column vector is flattened to 1-D.

    Returns
    -------
    DNDarray
        The validated 1-D label array.

    Raises
    ------
    ValueError
        If ``y`` is None, or is neither 1-D nor a column vector.
    """
    # BUG FIX: the None check must come first. The original tested for None
    # only after already converting and reshaping y, so a None label vector
    # crashed with an unrelated error before the intended ValueError.
    if y is None:
        raise ValueError(" requires y to be passed, but the target y is None")

    if not isinstance(y, ht.DNDarray):
        y = ht.array(y, split=0)

    # need 1D labels; accept an (n, 1) column vector and flatten it
    if len(y.shape) > 1:
        if y.shape[1] == 1:
            y = y.reshape(-1)
        else:
            raise ValueError("y should be a 1D array or a column vector.")

    return y
|
|
||
|
|
||
def check_consistent_length(X, y):
    """Ensure ``X`` and ``y`` describe the same number of samples.

    Also re-splits ``y`` in place so its distribution matches ``X``, then
    verifies that the local chunks line up on this rank.

    Raises
    ------
    ValueError
        If the global lengths or the local chunk sizes disagree.
    """
    n_x, n_y = X.shape[0], y.shape[0]
    if n_x != n_y:
        raise ValueError(
            f"Found input variables with inconsistent numbers of samples: [{n_x}, {n_y}]"
        )

    # Align the split axis of y with that of X (in place).
    if y.split != X.split:
        y.resplit_(X.split)

    # After re-splitting, the local chunk sizes must agree.
    if X.lshape[0] != y.lshape[0]:
        raise ValueError(
            "Local shapes of X and y do not match. Ensure they are partitioned identically."
        )
|
|
||
|
|
||
def check_random_state(seed):
    """Turn ``seed`` into Heat's global random module.

    Parameters
    ----------
    seed : None or int
        If None, ``ht.random`` is returned as-is; if an integer, it seeds
        ``ht.random`` first.

    Returns
    -------
    module
        The (possibly seeded) ``ht.random`` module.

    Raises
    ------
    ValueError
        If ``seed`` is of an unsupported type.
    """
    if seed is None:
        return ht.random

    # Accept both Python ints and NumPy integer scalars.
    if isinstance(seed, (int, np.integer)):
        ht.random.seed(int(seed))
        return ht.random

    raise ValueError(f"Type {type(seed)} cannot be used to seed ht.random")
|
|
||
|
|
||
| # Functions | ||
def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
    """Compute the silhouette coefficient for each sample.

    The silhouette value of a sample is ``(b - a) / max(a, b)``, where ``a``
    is the mean distance to the other members of its own cluster and ``b``
    is the mean distance to the members of the nearest other cluster.

    Parameters
    ----------
    X : DNDarray or array-like
        Feature array of shape (n_samples, n_features), or a precomputed
        pairwise distance matrix of shape (n_samples, n_samples) when
        ``metric="precomputed"``.
    labels : DNDarray or array-like
        Cluster label of each sample, shape (n_samples,).
    metric : str, optional
        Either ``"precomputed"`` or a metric understood by
        ``ht.spatial.cdist``.
    **kwds
        Additional keyword arguments (currently unused).

    Returns
    -------
    DNDarray
        Silhouette coefficient of each sample, shape (n_samples,).
        Samples in singleton clusters get 0.
    """
    X, labels = check_X_y(X, labels, accept_sparse=["csr"])

    X_distributed = ht.array(X, split=0)
    labels_distributed = ht.array(labels, split=0)

    if metric == "precomputed":
        # A valid precomputed distance matrix must have an (approximately)
        # zero diagonal: the distance of every point to itself is 0.
        diag_elements = ht.diag(X_distributed)
        if X_distributed.dtype.kind == "f":
            atol = ht.finfo(X_distributed.dtype).eps * 100  # tolerance based on machine accuracy
            bad_diagonal = ht.any(ht.abs(diag_elements) > atol)
        else:  # integral dtype: exact comparison
            bad_diagonal = ht.any(diag_elements != 0)
        if bad_diagonal:
            raise ValueError(
                "The precomputed distance matrix contains non-zero elements on the diagonal"
            )

    # E.g. labels = [10, 30, 20, 10] -> unique_labels = [10, 20, 30] and
    # labels_encoded = [0, 2, 1, 0].
    unique_labels, labels_encoded = ht.unique(labels_distributed, return_inverse=True)
    unique_labels.resplit_(None)
    # BUG FIX: the original called ``resplit`` (out-of-place) and discarded
    # the result; ``resplit_`` replicates the encodings in place so every
    # rank can look up per-sample cluster frequencies below.
    labels_encoded.resplit_(None)
    labels_freqs = ht.bincount(labels_encoded)
    labels_freqs.resplit_(None)
    n_samples = labels_distributed.shape[0]
    n_labels = unique_labels.shape[0]
    check_number_of_labels(n_labels, n_samples)

    if metric == "precomputed":
        D = X_distributed
    else:
        D = ht.spatial.cdist(X_distributed, X_distributed)

    # ---- a(i): mean distance to the other members of the own cluster ----
    # BUG FIX: the original compared the *encoded* labels against the *raw*
    # labels, which is only correct when the raw labels already are
    # 0..n_labels-1 (e.g. labels [10, 30, 20, 10] produced a wrong mask).
    # Compare raw labels against a replicated copy of themselves instead.
    labels_replicated = labels_distributed.resplit(None)
    a_mask = ht.reshape(labels_replicated, (1, -1)) == ht.reshape(labels_distributed, (-1, 1))
    a_mask = a_mask.astype(ht.float32)

    a_clust_dists = ht.sum(ht.mul(D, a_mask), axis=1)
    # Size of each sample's own cluster: a replicated lookup that is
    # identical on every rank, then re-split along the sample axis.
    freqs_per_sample = labels_freqs.larray[labels_encoded.larray]
    denominator_a = ht.array(freqs_per_sample - 1, split=0).astype(ht.float32)
    # Avoid division by zero for singleton clusters; their samples are
    # zeroed out at the end anyway.
    denominator_a = ht.where(denominator_a > 0, denominator_a, 1.0)

    a = ht.div(a_clust_dists, denominator_a)

    # ---- b(i): mean distance to the members of the nearest other cluster ----
    b_mask = ht.reshape(labels_distributed, (-1, 1)) == ht.reshape(unique_labels, (1, -1))
    # BUG FIX: the original built this one-hot membership matrix by comparing
    # the encoded labels against the raw unique label values; the correct
    # membership matrix is exactly ``b_mask``.
    full_b_mask = b_mask.astype(ht.float32)

    b_clust_dists = ht.matmul(D, full_b_mask)
    denominator_b = labels_freqs.astype(ht.float32)
    denominator_b.resplit_(None)
    b_clust_means = b_clust_dists / denominator_b
    # We want the *neighboring* cluster, so the distance to the own cluster
    # is masked with infinity before taking the row-wise minimum.
    b_clust_means.larray[b_mask.larray > 0] = float("inf")
    b = ht.min(b_clust_means, axis=1)

    sil_samples = ht.div(ht.sub(b, a), ht.maximum(a, b))

    # Samples in singleton clusters have no well-defined silhouette; set 0.
    has_neighbors = ht.array(freqs_per_sample > 1, split=0)
    sil_samples = ht.where(has_neighbors, sil_samples, 0.0)
    sil_samples = ht.nan_to_num(sil_samples)

    return sil_samples
|
|
||
|
|
||
def silhouette_score(X, labels, *, metric="euclidean", sample_size=None, random_state=None, **kwds):
    """Mean silhouette coefficient over all (optionally subsampled) samples.

    Parameters
    ----------
    X : array-like
        Feature array, or a precomputed pairwise distance matrix when
        ``metric="precomputed"``.
    labels : array-like
        Cluster label of each sample.
    metric : str, optional
        Forwarded to ``silhouette_samples``.
    sample_size : int or None, optional
        If given, the score is computed on a random subset of this size.
    random_state : int or None, optional
        Seed used when subsampling.
    **kwds
        Additional keyword arguments forwarded to ``silhouette_samples``.

    Returns
    -------
    float
        Mean silhouette coefficient.
    """
    if sample_size is not None:
        X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"])  # same as silhouette_samples
        rng = check_random_state(random_state)
        # Select a random subset of samples. NOTE(review): all ranks must
        # draw the same permutation for the distributed arrays to stay
        # consistent — confirm ht.random guarantees this.
        indices = rng.permutation(X.shape[0])[:sample_size]

        if metric == "precomputed":
            # X is a distance matrix here: subsample rows and columns alike.
            X, labels = X[indices].T[indices].T, labels[indices]
        else:
            X, labels = X[indices], labels[indices]
    return float(ht.mean(silhouette_samples(X, labels, metric=metric, **kwds)))
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We have moved the tests to a different directory in #2172, please move this file there. |
| Original file line number | Diff line number | Diff line change | ||||||||||||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,70 @@ | ||||||||||||||||||||||||
| import heat as ht | ||||||||||||||||||||||||
| import numpy as np | ||||||||||||||||||||||||
| from sklearn.metrics import silhouette_samples as sk_silhouette | ||||||||||||||||||||||||
| from sklearn.datasets import make_blobs | ||||||||||||||||||||||||
| from heat.cluster.metrics import silhouette_samples | ||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
def test_silhouette_implementation():
    """Compare the HeAT implementation against scikit-learn on blob data."""
    n_samples = 100
    n_features = 5
    centers = 3
    X_np, labels_np = make_blobs(
        n_samples=n_samples, n_features=n_features, centers=centers, random_state=42
    )

    # Reference values from scikit-learn
    sk_results = sk_silhouette(X_np, labels_np)

    # HeAT values
    X_ht = ht.array(X_np, split=0)
    labels_ht = ht.array(labels_np, split=0)
    ht_results = silhouette_samples(X_ht, labels_ht)

    # Comparison. BUG FIX: assert instead of printing pass/fail, so the test
    # actually fails under pytest/CI when the results diverge.
    ht_results_np = ht_results.numpy()
    max_diff = np.max(np.abs(sk_results - ht_results_np))
    assert np.allclose(sk_results, ht_results_np, atol=1e-5), (
        f"Results differ. Max diff: {max_diff}"
    )

    # Edge case: a cluster with a single sample must get silhouette 0.
    # BUG FIX: seed the random data so the test is deterministic.
    rng = np.random.default_rng(0)
    labels_edge = np.array([0, 0, 0, 1])
    X_edge = rng.random((4, n_features))

    res_edge = silhouette_samples(ht.array(X_edge), ht.array(labels_edge))
    assert res_edge[3] == 0, "Single-sample cluster should be assigned 0.0"
|
Comment on lines
+39
to
+40
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
def test_minimal_silhouette():
    """Tiny hand-checkable example, compared against scikit-learn."""
    X_np = np.array([[0, 0], [10, 10], [20, 20], [1, 1]], dtype=np.float32)
    labels_np = np.array([0, 2, 1, 0], dtype=np.int32)

    ht_res = silhouette_samples(X_np, labels_np)
    res_np = ht_res.numpy()

    sk_results = sk_silhouette(X_np, labels_np)

    # BUG FIX: assert instead of printing pass/fail, so the test actually
    # fails under pytest/CI when the results diverge.
    assert np.allclose(res_np, sk_results, atol=1e-5), (
        f"HeAT {res_np} != sklearn {sk_results}"
    )

    # Expected value for i=0 (Cluster 0)
    # a = dist((0,0), (1,1)) = 1.414
    # b = dist((0,0), (10,10)) = 14.14
    # sil = (14.14 - 1.414) / 14.14 = 0.9
    assert res_np[0] > 0.8, f"Point 0 is {res_np[0]:.4f}"
|
Comment on lines
+62
to
+65
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
|
|
||||||||||||||||||||||||
if __name__ == "__main__":
    # Run both test functions when executed directly as a script.
    for _test in (test_silhouette_implementation, test_minimal_silhouette):
        _test()
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You want the input to be a Heat array that is split according to the user's need. Have a look at
heat.sanitation.sanitize_into make sure the input is a valid heat DNDarray and then don't split.