Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
213 changes: 213 additions & 0 deletions heat/cluster/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,213 @@
import heat as ht
import numpy as np


# Checks
def check_number_of_labels(n_labels, n_samples):
    """Validate that the number of distinct labels is usable.

    A silhouette score is only defined when there are at least 2 clusters
    and at most ``n_samples - 1`` clusters.

    Parameters
    ----------
    n_labels : int
        Number of distinct labels.
    n_samples : int
        Number of samples.

    Raises
    ------
    ValueError
        If ``n_labels`` is not strictly between 1 and ``n_samples``.
    """
    valid = 1 < n_labels < n_samples
    if not valid:
        raise ValueError(
            "Number of labels is %d. Valid values are 2 to n_samples - 1 (inclusive)" % n_labels
        )


def check_X_y(X, y, accept_sparse=False):
    """Input validation for standard estimators.

    Converts both inputs to heat arrays, validates the label vector, and
    checks that the two have consistent lengths and partitioning.

    Parameters
    ----------
    X : {DNDarray, list, sparse matrix}
        Input data.
    y : {DNDarray, list, sparse matrix}
        Labels.
    accept_sparse : bool or list, optional
        Forwarded to ``check_array``; whether sparse input is accepted.

    Returns
    -------
    X_converted : object
        The converted and validated X.
    y_converted : object
        The converted and validated y.
    """
    X_checked = check_array(X, accept_sparse=accept_sparse, input_name="X")
    y_checked = _check_y(y)
    check_consistent_length(X_checked, y_checked)
    return X_checked, y_checked


def check_array(X, accept_sparse=False, input_name="X"):
    """Convert input to a row-split DNDarray, rejecting sparse input unless accepted.

    Parameters
    ----------
    X : DNDarray or array-like
        Input data; non-DNDarray input is converted with split=0.
    accept_sparse : bool or list, optional
        Truthy to allow sparse input.
    input_name : str, optional
        Name used in the error message.

    Returns
    -------
    DNDarray
        The validated (and possibly converted) input.
    """
    if not isinstance(X, ht.DNDarray):
        # Convert to heat array
        X = ht.array(X, split=0)

    # NOTE(review): relies on DNDarray exposing an ``is_sparse`` attribute —
    # confirm this exists in the targeted heat version.
    if not accept_sparse and X.is_sparse:
        raise TypeError(f"{input_name} is sparse, but sparse input is not accepted.")

    return X


def _check_y(y):
if not isinstance(y, ht.DNDarray):
y = ht.array(y, split=0)

# need 1D labels
if len(y.shape) > 1:
if y.shape[1] == 1:
y = y.reshape(-1)
else:
raise ValueError("y should be a 1D array or a column vector.")

if y is None:
raise ValueError(" requires y to be passed, but the target y is None")

return y


def check_consistent_length(X, y):
    """Ensure X and y agree in length and partitioning.

    Resplits ``y`` in place to match the split axis of ``X`` when they differ.

    Raises
    ------
    ValueError
        If the global first-axis lengths, or the process-local chunk
        lengths, do not match.
    """
    n_x, n_y = X.shape[0], y.shape[0]
    if n_x != n_y:
        raise ValueError(
            f"Found input variables with inconsistent numbers of samples: [{n_x}, {n_y}]"
        )

    # Align the distribution axis of y with that of X (in-place).
    if y.split != X.split:
        y.resplit_(X.split)

    # After alignment the local chunks must line up as well.
    if X.lshape[0] != y.lshape[0]:
        raise ValueError(
            "Local shapes of X and y do not match. Ensure they are partitioned identically."
        )


def check_random_state(seed):
    """Turn *seed* into the ``ht.random`` module, seeding it when given an int.

    Parameters
    ----------
    seed : None or int
        ``None`` leaves the global heat RNG untouched; an integer seeds it.

    Returns
    -------
    module
        The (possibly freshly seeded) ``ht.random`` module.

    Raises
    ------
    ValueError
        If ``seed`` is neither None nor an integer.
    """
    if seed is None:
        return ht.random
    if isinstance(seed, (int, np.integer)):
        # Seed heat's global RNG with a plain Python int.
        ht.random.seed(int(seed))
        return ht.random
    raise ValueError(f"Type {type(seed)} cannot be used to seed ht.random")


# Functions
def silhouette_samples(X, labels, *, metric="euclidean", **kwds):
    """Compute the silhouette coefficient for each sample.

    The silhouette of a sample is ``(b - a) / max(a, b)`` where ``a`` is
    the mean distance to the other members of its own cluster and ``b`` is
    the mean distance to the members of the nearest other cluster. Samples
    in singleton clusters are assigned a score of 0.

    Parameters
    ----------
    X : DNDarray or array-like
        Feature array of shape (n_samples, n_features), or a precomputed
        pairwise distance matrix of shape (n_samples, n_samples) when
        ``metric="precomputed"``.
    labels : DNDarray or array-like
        Cluster label for each sample, shape (n_samples,).
    metric : str, optional
        Either ``"precomputed"`` or a metric accepted by
        ``ht.spatial.cdist``. Default is "euclidean".
    **kwds
        Reserved for metric keyword arguments (currently unused).

    Returns
    -------
    DNDarray
        Silhouette coefficient per sample, shape (n_samples,).
    """
    X, labels = check_X_y(X, labels, accept_sparse=["csr"])

    # NOTE(review): this forces a row split regardless of the caller's
    # layout; consider sanitizing the input without resplitting instead.
    X_distributed = ht.array(X, split=0)
    labels_distributed = ht.array(labels, split=0)

    if metric == "precomputed":
        # A distance matrix must have a (numerically) zero diagonal.
        error_msg = ValueError(
            "The precomputed distance matrix contains non-zero elements on the diagonal"
        )
        diag_elements = ht.diag(X_distributed)

        if X_distributed.dtype.kind == "f":
            atol = ht.finfo(X_distributed.dtype).eps * 100  # tolerance based on machine accuracy
            if ht.any(ht.abs(diag_elements) > atol):
                raise error_msg
        elif ht.any(diag_elements != 0):  # integral dtype
            raise error_msg

    # Encode labels as 0..n_labels-1; e.g. labels = [10, 30, 20, 10] yields
    # unique_labels = [10, 20, 30] and labels_encoded = [0, 2, 1, 0].
    unique_labels, labels_encoded = ht.unique(labels_distributed, return_inverse=True)
    unique_labels.resplit_(None)
    # BUG FIX: was ``labels_encoded.resplit(None)`` (missing underscore, not
    # in place), so the replicated result was silently discarded.
    labels_encoded.resplit_(None)
    labels_freqs = ht.bincount(labels_encoded)
    labels_freqs.resplit_(None)
    n_samples = labels_distributed.shape[0]
    n_labels = unique_labels.shape[0]
    check_number_of_labels(n_labels, n_samples)

    # Pairwise distance matrix, rows distributed like X.
    if metric == "precomputed":
        D = X_distributed
    else:
        D = ht.spatial.cdist(X_distributed, X_distributed)

    # a(i) calculation: mean distance to the other members of i's cluster.
    # BUG FIX: the same-cluster mask must compare label values with label
    # values; the original compared encoded labels (0..k-1) against the raw
    # label values, which is only correct when labels are already 0..k-1.
    a_mask = ht.reshape(labels_distributed, (1, -1)) == ht.reshape(
        labels_distributed, (-1, 1)
    )
    a_mask = a_mask.astype(ht.float32)

    a_clust_dists = ht.sum(ht.mul(D, a_mask), axis=1)
    # Divide by (own cluster size - 1); clamp zero denominators (singleton
    # clusters) to 1 — those samples are zeroed out at the end anyway.
    denominator_a = labels_freqs.larray[labels_encoded.larray] - 1
    denominator_a = ht.where(
        ht.array(denominator_a, split=0).astype(ht.float32) > 0,
        ht.array(denominator_a, split=0),
        1.0,
    )

    a = ht.div(a_clust_dists, ht.array(denominator_a, split=0))

    # b(i) calculation: mean distance from each sample to every cluster.
    b_mask = ht.reshape(labels_distributed, (-1, 1)) == ht.reshape(unique_labels, (1, -1))
    # BUG FIX: the original built this from encoded labels compared against
    # raw unique label values; the membership mask is exactly b_mask.
    full_b_mask = b_mask.astype(ht.float32)

    b_clust_dists = ht.matmul(D, full_b_mask)
    denominator_b = labels_freqs.astype(ht.float32)
    denominator_b.resplit_(None)
    b_clust_means = b_clust_dists / denominator_b

    # We want the nearest *other* cluster, so the distance to the sample's
    # own cluster is set to infinity before taking the minimum.
    b_clust_means.larray[b_mask.larray > 0] = float("inf")
    b = ht.min(b_clust_means, axis=1)

    sil_samples = ht.div(ht.sub(b, a), ht.maximum(a, b))

    # Singleton clusters are defined to have a silhouette of 0.
    sil_samples = ht.where(labels_freqs[labels_encoded] > 1, sil_samples, 0.0)
    sil_samples = ht.nan_to_num(sil_samples)

    return sil_samples


def silhouette_score(X, labels, *, metric="euclidean", sample_size=None, random_state=None, **kwds):
    """Mean silhouette coefficient over all (optionally subsampled) samples.

    Parameters
    ----------
    X : DNDarray or array-like
        Feature array, or a precomputed distance matrix when
        ``metric="precomputed"``.
    labels : DNDarray or array-like
        Cluster labels per sample.
    metric : str, optional
        Forwarded to ``silhouette_samples``. Default is "euclidean".
    sample_size : int, optional
        If given, the score is computed on this many randomly chosen
        samples instead of the full data.
    random_state : None or int, optional
        Seed for the subsampling RNG (see ``check_random_state``).
    **kwds
        Forwarded to ``silhouette_samples``.

    Returns
    -------
    float
        The mean silhouette coefficient.
    """
    if sample_size is not None:
        X, labels = check_X_y(X, labels, accept_sparse=["csc", "csr"])  # same as silhouette_samples
        rng = check_random_state(random_state)
        # Select a random subset of samples; every rank must draw the same
        # indices, hence the shared seeded global RNG.
        indices = rng.permutation(X.shape[0])[:sample_size]
        if metric == "precomputed":
            # For a distance matrix, subsample both rows and columns.
            X, labels = X[indices].T[indices].T, labels[indices]
        else:
            X, labels = X[indices], labels[indices]
    return float(ht.mean(silhouette_samples(X, labels, metric=metric, **kwds)))
70 changes: 70 additions & 0 deletions heat/cluster/tests/test_metrics.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have moved the tests to a different directory in #2172, please move this file there.

Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import heat as ht
import numpy as np
from sklearn.metrics import silhouette_samples as sk_silhouette
from sklearn.datasets import make_blobs
from heat.cluster.metrics import silhouette_samples


def test_silhouette_implementation():
    """Compare heat's silhouette_samples against scikit-learn on blob data."""
    n_samples = 100
    n_features = 5
    centers = 3
    X_np, labels_np = make_blobs(
        n_samples=n_samples, n_features=n_features, centers=centers, random_state=42
    )

    # Reference values from scikit-learn.
    sk_results = sk_silhouette(X_np, labels_np)

    # Heat values, computed on row-distributed data.
    X_ht = ht.array(X_np, split=0)
    labels_ht = ht.array(labels_np, split=0)
    ht_results = silhouette_samples(X_ht, labels_ht)

    # BUG FIX: assert instead of printing pass/fail, and gather the
    # distributed result before converting to numpy.
    ht_results_np = ht_results.resplit(None).numpy()
    assert np.allclose(sk_results, ht_results_np, atol=1e-5), (
        f"Max diff between heat and sklearn: {np.max(np.abs(sk_results - ht_results_np))}"
    )

    # Edge case: a cluster with a single sample must score exactly 0.
    labels_edge = np.array([0, 0, 0, 1])
    X_edge = np.random.rand(4, n_features)
    res_edge = silhouette_samples(ht.array(X_edge), ht.array(labels_edge))
    assert res_edge[3] == 0, "Single-sample cluster should be assigned 0.0"
Comment on lines +39 to +40
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if res_edge[3] == 0:
print("✅ Edge Case Passed: Single-sample cluster correctly assigned 0.0")
assert res_edge[3] == 0:



def test_minimal_silhouette():
    """Sanity-check silhouette_samples on a tiny hand-constructed example."""
    X_np = np.array([[0, 0], [10, 10], [20, 20], [1, 1]], dtype=np.float32)
    labels_np = np.array([0, 2, 1, 0], dtype=np.int32)

    ht_res = silhouette_samples(X_np, labels_np)
    res_np = ht_res.numpy()

    # Reference values from scikit-learn for the same input.
    sk_results = sk_silhouette(X_np, labels_np)

    # Expected value for i=0 (cluster 0):
    #   a = dist((0,0), (1,1))   = 1.414
    #   b = dist((0,0), (10,10)) = 14.14
    #   sil = (14.14 - 1.414) / 14.14 ≈ 0.9
    # BUG FIX: assert instead of printing pass/fail.
    assert res_np[0] > 0.8, f"Point 0 is {res_np[0]:.4f}"
    assert np.allclose(res_np, sk_results, atol=1e-5)
Comment on lines +62 to +65
Copy link
Collaborator

@brownbaerchen brownbaerchen Mar 13, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if res_np[0] > 0.8:
print(f"✅ Success! Point 0 is {res_np[0]:.4f}")
else:
print(f"❌ Failure! Point 0 is {res_np[0]:.4f}")
assert res_np[0] > 0.8, f"Point 0 is {res_np[0]:.4f}"



# Allow running the tests directly (e.g. under mpirun) without pytest.
if __name__ == "__main__":
    test_silhouette_implementation()
    test_minimal_silhouette()
Loading