
Commit d828fd2

MatthewMiddlehurst, chrisholder, and pre-commit-ci-lite[bot] authored
Distance clustering paper webpage (#125)
* remove todo
* unskip notebooks
* experiment kwargs, normalisation and n_clusters
* skeleton docs
* kwargs tests and fixes
* back to aeon 3
* test threading
* scaler
* threaded
* kwargs
* RIST to rocket
* results csv
* temp distance page
* distance webpage
* coverage and fixes
* fixes
* bug fix plot alignment
* removed extra costmatrix
* [pre-commit.ci lite] apply automatic fixes
* Revert "[ENH] Cluster experiment update (#124)"

This reverts commit 47142b9.

---------

Co-authored-by: Chris Holder <[email protected]>
Co-authored-by: pre-commit-ci-lite[bot] <117423508+pre-commit-ci-lite[bot]@users.noreply.github.com>
1 parent 60a6f56 commit d828fd2

18 files changed: +1544 −8 lines

Diff for: tsml_eval/experiments/tests/__init__.py

+5 −4

@@ -11,17 +11,18 @@
 from pathlib import Path
 
 _CLASSIFIER_RESULTS_PATH = (
-    os.path.dirname(Path(__file__).parent.parent) + "/test_output/classification/"
+    os.path.dirname(Path(__file__).parent.parent.parent)
+    + "/test_output/classification/"
 )
 
 _CLUSTERER_RESULTS_PATH = (
-    os.path.dirname(Path(__file__).parent.parent) + "/test_output/clustering/"
+    os.path.dirname(Path(__file__).parent.parent.parent) + "/test_output/clustering/"
 )
 
 _FORECASTER_RESULTS_PATH = (
-    os.path.dirname(Path(__file__).parent.parent) + "/test_output/forecasting/"
+    os.path.dirname(Path(__file__).parent.parent.parent) + "/test_output/forecasting/"
 )
 
 _REGRESSOR_RESULTS_PATH = (
-    os.path.dirname(Path(__file__).parent.parent) + "/test_output/regression/"
+    os.path.dirname(Path(__file__).parent.parent.parent) + "/test_output/regression/"
 )
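This change resolves each results path one directory higher: `os.path.dirname(Path(__file__).parent.parent)` pointed inside the `tsml_eval` package, while the extra `.parent` points at the project root. A minimal sketch of the difference, assuming a hypothetical checkout at `/repo`:

import os
from pathlib import Path

# stand-in for the real __file__ of tsml_eval/experiments/tests/__init__.py
module_file = Path("/repo/tsml_eval/experiments/tests/__init__.py")

old_base = os.path.dirname(module_file.parent.parent)         # "/repo/tsml_eval"
new_base = os.path.dirname(module_file.parent.parent.parent)  # "/repo"

print(old_base + "/test_output/classification/")  # before: inside the package
print(new_base + "/test_output/classification/")  # after: at the repository root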
Diff for: tsml_eval/publications/y2023/distance_based_clustering/__init__.py

@@ -1 +1,15 @@
 """Files for distance-based clustering publication."""
+
+__all__ = [
+    "_set_distance_clusterer",
+    "_run_experiment",
+    "distance_based_clusterers",
+]
+
+from tsml_eval.publications.y2023.distance_based_clustering.run_distance_experiments import (  # noqa: E501
+    _run_experiment,
+)
+from tsml_eval.publications.y2023.distance_based_clustering.set_distance_clusterer import (  # noqa: E501
+    _set_distance_clusterer,
+    distance_based_clusterers,
+)
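With these re-exports, the publication's helpers can be imported from the package root. A minimal sketch of that usage; note the diff does not show how `_set_distance_clusterer` is called, so the single-name-argument pattern below is an assumption:

from tsml_eval.publications.y2023.distance_based_clustering import (
    _set_distance_clusterer,
    distance_based_clusterers,
)

# names of the clusterers benchmarked in the publication
print(distance_based_clusterers)

# assumed call pattern: build an estimator from one of those names
clusterer = _set_distance_clusterer(distance_based_clusterers[0])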
@@ -0,0 +1,151 @@

"""Alignment path plotting utilities."""

import numpy as np
from aeon.distances import cost_matrix as compute_cost_matrix
from aeon.distances._distance import alignment_path, pairwise_distance
from aeon.utils.validation._dependencies import _check_soft_dependencies


def _path_mask(cost_matrix, path, ax, theme=None):  # pragma: no cover
    _check_soft_dependencies("matplotlib")

    import matplotlib.colors as colorplt

    if theme is None:
        theme = colorplt.LinearSegmentedColormap.from_list("", ["#c9cacb", "white"])

    # shade cells by status: 1.0 = on the alignment path, 0.0 = masked (inf cost),
    # 0.25 = reachable but off the path; assumes a square cost matrix
    plot_matrix = np.zeros_like(cost_matrix)
    max_size = max(cost_matrix.shape)
    for i in range(max_size):
        for j in range(max_size):
            if (i, j) in path:
                plot_matrix[i, j] = 1.0
            elif cost_matrix[i, j] == np.inf:
                plot_matrix[i, j] = 0.0
            else:
                plot_matrix[i, j] = 0.25

    # annotate every cell with its (rounded) cost
    for i in range(max_size):
        for j in range(max_size):
            c = cost_matrix[j, i]
            ax.text(i, j, str(round(c, 2)), va="center", ha="center", size=10)

    ax.matshow(plot_matrix, cmap=theme)


def _pairwise_path(x, y, metric):  # pragma: no cover
    # fallback for metrics with no alignment path: pair index i with index i
    pw_matrix = pairwise_distance(x, y, metric=metric)
    path = []
    for i in range(pw_matrix.shape[0]):
        for j in range(pw_matrix.shape[1]):
            if i == j:
                path.append((i, j))
    return path, pw_matrix.trace(), pw_matrix


def _plot_path(  # pragma: no cover
    x: np.ndarray,
    y: np.ndarray,
    metric: str,
    dist_kwargs: dict = None,
    title: str = "",
    plot_over_pw: bool = False,
):
    _check_soft_dependencies("matplotlib")

    import matplotlib.pyplot as plt

    if dist_kwargs is None:
        dist_kwargs = {}
    try:
        path, dist = alignment_path(x, y, metric=metric, **dist_kwargs)
        cost_matrix = compute_cost_matrix(x, y, metric=metric, **dist_kwargs)

        # shift the path to match the padded lcss cost matrix
        if metric == "lcss":
            _path = []
            for tup in path:
                _path.append(tuple(x + 1 for x in tup))
            path = _path

        if plot_over_pw is True:
            if metric == "lcss":
                pw = pairwise_distance(x, y, metric="euclidean")
                cost_matrix = np.zeros_like(cost_matrix)
                cost_matrix[1:, 1:] = pw
            else:
                pw = pairwise_distance(x, y, metric="squared")
                cost_matrix = pw
    except NotImplementedError:
        path, dist, cost_matrix = _pairwise_path(x, y, metric)

    plt.figure(1, figsize=(8, 8))
    x_size = x.shape[0]

    # definitions for the axes
    left, bottom = 0.01, 0.1
    w_ts = h_ts = 0.2
    left_h = left + w_ts + 0.02
    width = height = 0.65
    bottom_h = bottom + height + 0.02

    rect_s_y = [left, bottom, w_ts, height]
    rect_gram = [left_h, bottom, width, height]
    rect_s_x = [left_h, bottom_h, width, h_ts]

    ax_gram = plt.axes(rect_gram)
    ax_s_x = plt.axes(rect_s_x)
    ax_s_y = plt.axes(rect_s_y)

    _path_mask(cost_matrix, path, ax_gram)
    ax_gram.axis("off")
    ax_gram.autoscale(False)
    # ax_gram.plot([j for (i, j) in path], [i for (i, j) in path], "w-",
    # linewidth=3.)

    ax_s_x.plot(np.arange(x_size), y, "b-", linewidth=3.0, color="#818587")
    ax_s_x.axis("off")
    ax_s_x.set_xlim((0, x_size - 1))

    ax_s_y.plot(-x, np.arange(x_size), "b-", linewidth=3.0, color="#818587")
    ax_s_y.axis("off")
    ax_s_y.set_ylim((0, x_size - 1))

    ax_s_x.set_title(title, size=10)

    return plt


def _plot_alignment(  # pragma: no cover
    x, y, metric, dist_kwargs: dict = None, title: str = ""
):
    _check_soft_dependencies("matplotlib")

    import matplotlib.pyplot as plt

    if dist_kwargs is None:
        dist_kwargs = {}
    try:
        path, dist = alignment_path(x, y, metric=metric, **dist_kwargs)
    except NotImplementedError:
        path, dist, cost_matrix = _pairwise_path(x, y, metric)

    plt.figure(1, figsize=(8, 8))

    plt.plot(x, "b-", color="black")
    plt.plot(y, "g-", color="black")

    # dashed connectors between aligned points
    for positions in path:
        try:
            plt.plot(
                [positions[0], positions[1]],
                [x[positions[0]], y[positions[1]]],
                "--",
                color="#818587",
            )
        except Exception:
            continue
    plt.title(title)

    plt.tight_layout()
    return plt
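Both helpers return the `matplotlib.pyplot` module, so the caller can show or save the figure. A short usage sketch on two synthetic series; the import path is an assumption, since the commit page does not show this file's name, and "dtw" is one of the aeon metrics with an exact alignment path:

import numpy as np

# Hypothetical import path: the new file's name is not shown on the commit page.
from tsml_eval.publications.y2023.distance_based_clustering.plot_alignment import (
    _plot_alignment,
    _plot_path,
)

x = np.sin(np.linspace(0, 2 * np.pi, 20))
y = np.sin(np.linspace(0.5, 2 * np.pi + 0.5, 20))

# cost matrix with the warping path overlaid, plus the two series on the margins
plt = _plot_path(x, y, metric="dtw", title="DTW warping path")
plt.show()

# the two series with dashed lines connecting aligned points
plt = _plot_alignment(x, y, metric="dtw", title="DTW alignment")
plt.show()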
@@ -0,0 +1,175 @@

"""Code for tuning clusterer parameters used in the publication."""

import sys

import numpy as np
from aeon.clustering.k_means import TimeSeriesKMeans
from sklearn.metrics import davies_bouldin_score


# used primarily for dtw and wdtw
def _tune_window(metric, train_X, n_clusters):  # pragma: no cover
    best_w = 0
    best_score = sys.float_info.max
    for w in np.arange(0.0, 0.2, 0.01):
        cls = TimeSeriesKMeans(
            metric=metric, distance_params={"window": w}, n_clusters=n_clusters
        )
        cls.fit(train_X)
        preds = cls.predict(train_X)
        clusters = len(np.unique(preds))
        if clusters <= 1:
            # a single cluster cannot be scored, treat as worst case
            score = sys.float_info.max
        else:
            score = davies_bouldin_score(train_X, preds)
        print(f" Number of clusters = {clusters} window = {w} score = {score}")
        if score < best_score:
            best_score = score
            best_w = w
    print("best window =", best_w, " with score ", best_score)
    return best_w


def _tune_msm(train_X, n_clusters):  # pragma: no cover
    best_c = 0
    best_score = sys.float_info.max
    for c in np.arange(0.0, 5.0, 0.25):
        cls = TimeSeriesKMeans(
            metric="msm", distance_params={"c": c}, n_clusters=n_clusters
        )
        cls.fit(train_X)
        preds = cls.predict(train_X)
        clusters = len(np.unique(preds))
        if clusters <= 1:
            score = sys.float_info.max
        else:
            score = davies_bouldin_score(train_X, preds)
        print(f" Number of clusters = {clusters} c parameter = {c} score = {score}")
        if score < best_score:
            best_score = score
            best_c = c
    print("best c =", best_c, " with score ", best_score)
    return best_c


def _tune_wdtw(train_X, n_clusters):  # pragma: no cover
    best_g = 0
    best_score = sys.float_info.max
    for g in np.arange(0.0, 1.0, 0.05):
        cls = TimeSeriesKMeans(
            metric="wdtw", distance_params={"g": g}, n_clusters=n_clusters
        )
        cls.fit(train_X)
        preds = cls.predict(train_X)
        clusters = len(np.unique(preds))
        if clusters <= 1:
            score = sys.float_info.max
        else:
            score = davies_bouldin_score(train_X, preds)
        print(f" Number of clusters = {clusters} g parameter = {g} score = {score}")
        if score < best_score:
            best_score = score
            best_g = g
    print("best g =", best_g, " with score ", best_score)
    return best_g


def _tune_twe(train_X, n_clusters):  # pragma: no cover
    best_nu = 0
    best_lambda = 0
    best_score = sys.float_info.max
    for nu in np.arange(0.0, 1.0, 0.25):
        for lam in np.arange(0.0, 1.0, 0.2):
            cls = TimeSeriesKMeans(
                metric="twe",
                distance_params={"nu": nu, "lmbda": lam},
                n_clusters=n_clusters,
            )
            cls.fit(train_X)
            preds = cls.predict(train_X)
            clusters = len(np.unique(preds))
            if clusters <= 1:
                score = sys.float_info.max
            else:
                score = davies_bouldin_score(train_X, preds)
            print(
                f" Number of clusters = {clusters} nu param = {nu} lambda param "
                f"= {lam} score = {score}"
            )  # noqa
            if score < best_score:
                best_score = score
                best_nu = nu
                best_lambda = lam
    print("best nu =", best_nu, f" lambda = {best_lambda} score ", best_score)  # noqa
    return best_nu, best_lambda


def _tune_erp(train_X, n_clusters):  # pragma: no cover
    best_g = 0
    best_score = sys.float_info.max
    for g in np.arange(0.0, 2.0, 0.2):
        cls = TimeSeriesKMeans(
            metric="erp", distance_params={"g": g}, n_clusters=n_clusters
        )
        cls.fit(train_X)
        preds = cls.predict(train_X)
        clusters = len(np.unique(preds))
        if clusters <= 1:
            score = sys.float_info.max
        else:
            score = davies_bouldin_score(train_X, preds)
        print(f" Number of clusters = {clusters} g parameter = {g} score = {score}")
        if score < best_score:
            best_score = score
            best_g = g
    print("best g =", best_g, " with score ", best_score)
    return best_g


def _tune_edr(train_X, n_clusters):  # pragma: no cover
    best_e = 0
    best_score = sys.float_info.max
    for e in np.arange(0.0, 0.2, 0.01):
        cls = TimeSeriesKMeans(
            metric="edr", distance_params={"epsilon": e}, n_clusters=n_clusters
        )
        cls.fit(train_X)
        preds = cls.predict(train_X)
        clusters = len(np.unique(preds))
        if clusters <= 1:
            score = sys.float_info.max
        else:
            score = davies_bouldin_score(train_X, preds)
        print(
            f" Number of clusters = {clusters} epsilon parameter = {e} score = {score}"
        )
        if score < best_score:
            best_score = score
            best_e = e
    print("best e =", best_e, " with score ", best_score)  # noqa
    return best_e


def _tune_lcss(train_X, n_clusters):  # pragma: no cover
    best_e = 0
    best_score = sys.float_info.max
    for e in np.arange(0.0, 0.2, 0.01):
        cls = TimeSeriesKMeans(
            metric="lcss", distance_params={"epsilon": e}, n_clusters=n_clusters
        )
        cls.fit(train_X)
        preds = cls.predict(train_X)
        clusters = len(np.unique(preds))
        if clusters <= 1:
            score = sys.float_info.max
        else:
            score = davies_bouldin_score(train_X, preds)
        print(
            f" Number of clusters = {clusters} epsilon parameter = {e} score = {score}"
        )
        if score < best_score:
            best_score = score
            best_e = e
    print("best e =", best_e, " with score ", best_score)
    return best_e
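Each tuner sweeps a single distance parameter for TimeSeriesKMeans and keeps the value with the lowest Davies-Bouldin score on the training data. A minimal usage sketch on synthetic data; the module name, data shape, and n_clusters below are illustrative assumptions, not taken from the commit:

import numpy as np

# Hypothetical import path: the commit does not show this file's name.
from tsml_eval.publications.y2023.distance_based_clustering.tune_parameters import (
    _tune_msm,
    _tune_window,
)

rng = np.random.default_rng(0)
train_X = rng.normal(size=(30, 50))  # 30 univariate series of length 50

best_window = _tune_window("dtw", train_X, n_clusters=3)  # sweeps w in [0.0, 0.2)
best_c = _tune_msm(train_X, n_clusters=3)  # sweeps c in [0.0, 5.0)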
