Skip to content

Commit 9556b54

Browse files
authored
More thorough testing (#142)
* covar cleanup * test bdc * kcca: use sqrt-eigenvalues * fix lambda * box_discretization * docs / notebooks update * tests
1 parent d70bbb0 commit 9556b54

37 files changed

+776
-449
lines changed

deeptime/clustering/__init__.py

+17-1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
KMeans
1313
MiniBatchKMeans
1414
RegularSpace
15+
BoxDiscretization
1516
1617
1718
===============================================================================
@@ -24,6 +25,18 @@
2425
2526
ClusterModel
2627
KMeansModel
28+
BoxDiscretizationModel
29+
30+
31+
===============================================================================
32+
Functions
33+
===============================================================================
34+
35+
.. autosummary::
36+
:toctree: generated/
37+
:template: class_nomodule.rst
38+
39+
kmeans_plusplus
2740
2841
2942
===============================================================================
@@ -34,7 +47,7 @@
3447
:toctree: generated/
3548
:template: class_nomodule.rst
3649
37-
_clustering_bindings.Metric
50+
Metric
3851
metrics
3952
MetricRegistry
4053
"""
@@ -43,4 +56,7 @@
4356
from ._clustering_bindings import Metric
4457
from ._kmeans import KMeans, MiniBatchKMeans, KMeansModel
4558
from ._regspace import RegularSpace
59+
from ._box import BoxDiscretization, BoxDiscretizationModel
4660
from ._cluster_model import ClusterModel
61+
62+
from ._kmeans import kmeans_plusplus

deeptime/clustering/_box.py

+111
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
from typing import Optional
2+
3+
import numpy as np
4+
5+
from ._cluster_model import ClusterModel
6+
from ..base import Estimator
7+
8+
9+
class BoxDiscretizationModel(ClusterModel):
10+
r""" Model produced by :class:`BoxDiscretization`. Can be used to discretize and/or one-hot transform data.
11+
12+
Parameters
13+
----------
14+
cluster_centers : ndarray
15+
The cluster centers.
16+
v0 : ndarray
17+
Lower left vertex of box.
18+
v1 : ndarray
19+
Upper right vertex of box.
20+
n_boxes : int
21+
Number of boxes.
22+
"""
23+
24+
def __init__(self, cluster_centers: np.ndarray, v0, v1, n_boxes):
25+
super().__init__(cluster_centers)
26+
self.v0 = v0
27+
self.v1 = v1
28+
self.n_boxes = n_boxes
29+
30+
def transform_onehot(self, data, n_jobs=None):
31+
r"""Transforms data into discrete states with subsequent one-hot encoding.
32+
33+
Parameters
34+
----------
35+
data : ndarray
36+
Input data
37+
n_jobs : int or None, optional, default=None
38+
Number of jobs.
39+
40+
Returns
41+
-------
42+
one_hot : ndarray
43+
A (T, n_boxes) shaped array with one-hot encoded data.
44+
"""
45+
dtraj = self.transform(data, n_jobs=n_jobs)
46+
traj_onehot = np.zeros((len(data), self.n_clusters))
47+
traj_onehot[np.arange(len(data)), dtraj] = 1.
48+
return traj_onehot
49+
50+
51+
class BoxDiscretization(Estimator):
52+
r"""An n-dimensional box discretization of Euclidean space.
53+
54+
It spans an n-dimensional grid based on linspaces along each axis which is then used as cluster centers.
55+
The linspaces are bounded either by the user (attributes :attr:`v0` and :attr:`v1`) or estimated from data.
56+
57+
Parameters
58+
----------
59+
dim : int
60+
Dimension of the box discretization.
61+
n_boxes : int or list of int
62+
Number of boxes per dimension of - if given as single integer - for all dimensions.
63+
v0 : array or None, optional, default=None
64+
Lower left vertex of the box discretization. If not given this is estimated from data.
65+
v1 : array or None, optional, default=None
66+
Upper right vertex of the box discretization. If not given this is estimated from data.
67+
"""
68+
69+
def __init__(self, dim: int, n_boxes, v0=None, v1=None):
70+
super().__init__()
71+
if not isinstance(n_boxes, (list, tuple, np.ndarray)):
72+
if int(n_boxes) == n_boxes:
73+
n_boxes = [int(n_boxes)] * dim
74+
if len(n_boxes) != dim:
75+
raise ValueError(f"Dimension and number of boxes per dimension did not match ({len(n_boxes)} and {dim}).")
76+
if v0 is not None and len(v0) != dim:
77+
raise ValueError("Length of v0 did not match dimension.")
78+
if v1 is not None and len(v1) != dim:
79+
raise ValueError("Length of v1 did not match dimension.")
80+
self.dim = dim
81+
self.n_boxes = n_boxes
82+
self.v0 = v0
83+
self.v1 = v1
84+
85+
def fit(self, data: np.ndarray, **kwargs):
86+
assert data.shape[1] == self.dim
87+
if self.v0 is None or self.v1 is None:
88+
v0 = np.empty((self.dim,), dtype=data.dtype) if self.v0 is None else self.v0
89+
v1 = np.empty((self.dim,), dtype=data.dtype) if self.v1 is None else self.v1
90+
for d in range(self.dim):
91+
if self.v0 is None:
92+
v0[d] = np.min(data[:, d])
93+
if self.v1 is None:
94+
v1[d] = np.max(data[:, d])
95+
else:
96+
v0 = self.v0
97+
v1 = self.v1
98+
linspaces = [np.linspace(v0[d], v1[d], num=self.n_boxes[d], endpoint=True) for d in range(self.dim)]
99+
mesh = np.vstack(np.meshgrid(*tuple(linspaces))).reshape(self.dim, -1).T
100+
self._model = BoxDiscretizationModel(mesh, v0, v1, self.n_boxes)
101+
return self
102+
103+
def fetch_model(self) -> Optional[BoxDiscretizationModel]:
104+
r""" Yields the estimated model.
105+
106+
Returns
107+
-------
108+
model : BoxDiscretizationModel or None
109+
The model.
110+
"""
111+
return super().fetch_model()

deeptime/clustering/_kmeans.py

+37-5
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,44 @@
88
from ._cluster_model import ClusterModel
99
from . import _clustering_bindings as _bd, metrics
1010

11-
__all__ = ['KMeans', 'MiniBatchKMeans', 'KMeansModel']
12-
1311
from ..util.parallel import handle_n_jobs
1412

1513

14+
def kmeans_plusplus(data, n_clusters: int, metric: str = 'euclidean', callback=None, seed: int = -1,
15+
n_jobs: Optional[int] = None):
16+
r""" Performs kmeans++ initialization. :footcite:`arthur2006k`
17+
18+
Parameters
19+
----------
20+
data : np.ndarray
21+
Input data of shape (T, n_dim).
22+
n_clusters : int
23+
The number of cluster centers.
24+
metric : str, default='euclidean'
25+
Metric to use during clustering, default evaluates to euclidean metric. For a list of available metrics,
26+
see the :data:`metric registry <deeptime.clustering.metrics>`.
27+
callback: callable or None
28+
used for kmeans++ initialization to indicate progress, called once per assigned center.
29+
seed : int, optional, default=-1
30+
The random seed. If non-negative, this fixes the random generator's seed and makes results reproducible.
31+
n_jobs : int, optional, default=None
32+
Number of jobs.
33+
34+
Returns
35+
-------
36+
centers : np.ndarray
37+
An (n_centers, dim)-shaped array with a kmeans++ cluster center initial guess.
38+
39+
References
40+
----------
41+
.. footbibliography::
42+
"""
43+
n_jobs = handle_n_jobs(n_jobs)
44+
metric = metrics[metric]()
45+
return _bd.kmeans.init_centers_kmpp(data, k=n_clusters, random_seed=seed, n_threads=n_jobs,
46+
callback=callback, metric=metric)
47+
48+
1649
class KMeansModel(ClusterModel):
1750
r"""The K-means clustering model. Stores all important information which are result of the estimation procedure.
1851
It can also be used to transform data by assigning each frame to its closest cluster center. For an example
@@ -359,9 +392,8 @@ def _pick_initial_centers(self, data, strategy, n_jobs, callback=None):
359392
if strategy == 'uniform':
360393
return data[self.random_state.randint(0, len(data), size=self.n_clusters)]
361394
elif self.init_strategy == 'kmeans++':
362-
metric = metrics[self.metric]()
363-
return _bd.kmeans.init_centers_kmpp(data, k=self.n_clusters, random_seed=self.fixed_seed, n_threads=n_jobs,
364-
callback=callback, metric=metric)
395+
return kmeans_plusplus(data, self.n_clusters, self.metric,
396+
callback=callback, seed=self.fixed_seed, n_jobs=n_jobs)
365397

366398
def fit(self, data, initial_centers=None, callback_init_centers=None, callback_loop=None, n_jobs=None):
367399
""" Perform the clustering.

deeptime/covariance/util/_moments.py

+1-9
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
"""
2-
32
Data Types
43
----------
54
The standard data type for covariance computations is
@@ -75,20 +74,13 @@
7574
__author__ = 'noe'
7675

7776
import math
78-
import numbers
7977
import numpy as np
8078
from .covar_c import covartools
8179

8280

8381
def _is_zero(x):
8482
""" Returns True if x is numerically 0 or an array with 0's. """
85-
if x is None:
86-
return True
87-
if isinstance(x, numbers.Number):
88-
return x == 0.0
89-
if isinstance(x, np.ndarray):
90-
return np.all(x == 0)
91-
return False
83+
return x is None or np.all(np.asarray(x) == 0)
9284

9385

9486
def _sparsify(X, remove_mean=False, modify_data=False, sparse_mode='auto', sparse_tol=0.0):

deeptime/covariance/util/_running_moments.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,6 @@ def covar(self, bessels_correction=True):
8484

8585

8686
class MomentsStorage:
87-
"""
88-
"""
8987

9088
def __init__(self, nsave, remove_mean=False, rtol=1.5):
9189
"""
@@ -196,7 +194,8 @@ def __init__(self, compute_XX=True, compute_XY=False, compute_YY=False,
196194
warnings.warn('symmetrize=True has no effect with compute_XY=False.')
197195
if diag_only and sparse_mode != 'dense':
198196
if sparse_mode == 'sparse':
199-
warnings.warn('Computing diagonal entries only is not implemented for sparse mode. Switching to dense mode.')
197+
warnings.warn('Computing diagonal entries only is not implemented for sparse mode. '
198+
'Switching to dense mode.')
200199
sparse_mode = 'dense'
201200
# storage
202201
self.compute_XX = compute_XX
@@ -253,9 +252,10 @@ def add(self, X, Y=None, weights=None, column_selection=None):
253252
# Check appropriate length if weights is an array:
254253
elif isinstance(weights, np.ndarray):
255254
if len(weights) != T:
256-
raise ValueError('weights and X must have equal length. Was {} and {} respectively.'.format(len(weights), len(X)))
255+
raise ValueError(f'Weights and X must have equal length. '
256+
f'Was {len(weights)} and {len(X)}, respectively.')
257257
else:
258-
raise TypeError('weights is of type %s, must be a number or ndarray' % (type(weights)))
258+
raise TypeError(f'Weights is of type {type(weights)}, must be a number or ndarray.')
259259
# estimate and add to storage
260260
if self.compute_XX and not self.compute_XY and not self.compute_YY:
261261
w, s_X, C_XX = moments_XX(X, remove_mean=self.remove_mean, weights=weights, sparse_mode=self.sparse_mode,

deeptime/covariance/util/covar_c/covartools.cpp

+8-5
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@ PYBIND11_MODULE(_covartools, m) {
1111
// ================================================
1212
// Check for constant columns
1313
// ================================================
14-
m.def("variable_cols_char", &_variable_cols<char>);
15-
m.def("variable_cols_int", &_variable_cols<int>);
16-
m.def("variable_cols_long", &_variable_cols<long>);
17-
m.def("variable_cols_float", &_variable_cols<float>);
18-
m.def("variable_cols_double", &_variable_cols<double>);
14+
m.def("variable_cols", &_variable_cols<char>);
15+
m.def("variable_cols", &_variable_cols<bool>);
16+
m.def("variable_cols", &_variable_cols<int>);
17+
m.def("variable_cols", &_variable_cols<long>);
18+
m.def("variable_cols", &_variable_cols<float>);
19+
m.def("variable_cols", &_variable_cols<double>);
20+
m.def("variable_cols", &_variable_cols<std::complex<float>>);
21+
m.def("variable_cols", &_variable_cols<std::complex<double>>);
1922
}

deeptime/covariance/util/covar_c/covartools.hpp

+12-13
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,12 @@
11
#pragma once
22

33
#include <cstdlib>
4+
#include <complex>
45
#include <pybind11/pybind11.h>
56
#include <pybind11/numpy.h>
67

8+
#include "common.h"
9+
710
namespace py = pybind11;
811

912

@@ -16,30 +19,26 @@ namespace py = pybind11;
1619
1720
*/
1821
template<typename dtype>
19-
int _variable_cols(py::array_t<bool, py::array::c_style> &np_cols,
20-
const py::array_t<dtype, py::array::c_style> &np_X,
22+
int _variable_cols(np_array_nfc<bool> &np_cols,
23+
const np_array_nfc<dtype> &np_X,
2124
float tol=0, std::size_t min_constant=0) {
2225
// compare first and last row to get constant candidates
23-
std::size_t i, j;
24-
std::size_t ro;
2526
std::size_t M = static_cast<std::size_t>(np_X.shape(0)), N = static_cast<std::size_t>(np_X.shape(1));
26-
dtype diff;
2727
std::size_t nconstant = N; // current number of constant columns
2828
auto cols = np_cols.mutable_data(0);
2929
auto X = np_X.data(0);
3030
// by default all 0 (constant)
31-
for (j = 0; j < N; j++)
31+
for (std::size_t j = 0; j < N; j++)
3232
cols[j] = false;
3333

3434
// go through all rows in order to confirm constant candidates
35-
for (i = 0; i < M; i++) {
36-
ro = i * N;
37-
for (j = 0; j < N; j++) {
35+
for (std::size_t i = 0; i < M; i++) {
36+
auto ro = i * N;
37+
for (std::size_t j = 0; j < N; j++) {
3838
if (! cols[j]) {
39-
// note: the compiler will eliminate this branch, if dtype != (float, double)
40-
if (std::is_floating_point<dtype>::value) {
41-
diff = std::abs(X[j] - X[ro + j]);
42-
if (diff >= tol) {
39+
if constexpr (std::is_floating_point<dtype>::value) {
40+
auto diff = std::abs(X[j] - X[ro + j]);
41+
if (diff > tol) {
4342
cols[j] = true;
4443
nconstant--;
4544
// are constant columns below threshold? Then interrupt.
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
import numpy as np
22

33

4-
def variable_cols(X: np.ndarray, tol=0.0, min_constant=0):
4+
def variable_cols(data: np.ndarray, tol=0.0, min_constant=0):
55
""" Evaluates which columns are constant (0) or variable (1)
66
77
Parameters
88
----------
9-
X : ndarray
9+
data : ndarray
1010
Matrix whose columns will be checked for constant or variable.
1111
tol : float
1212
Tolerance for float-matrices. When set to 0 only equal columns with
@@ -26,31 +26,10 @@ def variable_cols(X: np.ndarray, tol=0.0, min_constant=0):
2626
variable / non-constant. False: column is constant.
2727
2828
"""
29-
if X is None:
30-
return None
31-
from ._covartools import (variable_cols_double,
32-
variable_cols_float,
33-
variable_cols_int,
34-
variable_cols_long,
35-
variable_cols_char)
29+
from ._covartools import variable_cols as impl
3630
# prepare column array
37-
cols = np.zeros(X.shape[1], dtype=bool, order='C')
38-
39-
if X.dtype == np.float64:
40-
completed = variable_cols_double(cols, X, tol, min_constant)
41-
elif X.dtype == np.float32:
42-
completed = variable_cols_float(cols, X, tol, min_constant)
43-
elif X.dtype == np.int32:
44-
completed = variable_cols_int(cols, X, 0, min_constant)
45-
elif X.dtype == np.int64:
46-
completed = variable_cols_long(cols, X, 0, min_constant)
47-
elif X.dtype == np.bool_:
48-
completed = variable_cols_char(cols, X, 0, min_constant)
49-
else:
50-
raise TypeError('unsupported type of X: %s' % X.dtype)
31+
cols = np.zeros(data.shape[1], dtype=bool, order='C')
32+
completed = impl(cols, data, tol, min_constant)
5133

5234
# if interrupted, return all ones. Otherwise return the variable columns as bool array
53-
if completed == 0:
54-
return np.ones_like(cols, dtype=bool)
55-
56-
return cols
35+
return cols if completed == 1 else np.ones_like(cols, dtype=bool)

0 commit comments

Comments
 (0)