deeptime-ml
diff --git a/‎deeptime/clustering/__init__.py
+17-1 b/‎deeptime/clustering/__init__.py
+17-1
diff --git a/‎deeptime/clustering/_box.py
+111 b/‎deeptime/clustering/_box.py
+111
diff --git a/‎deeptime/clustering/_kmeans.py
+37-5 b/‎deeptime/clustering/_kmeans.py
+37-5
diff --git a/‎deeptime/covariance/util/_moments.py
+1-9 b/‎deeptime/covariance/util/_moments.py
+1-9
diff --git a/‎deeptime/covariance/util/_running_moments.py
+5-5 b/‎deeptime/covariance/util/_running_moments.py
+5-5
diff --git a/‎deeptime/covariance/util/covar_c/covartools.cpp
+8-5 b/‎deeptime/covariance/util/covar_c/covartools.cpp
+8-5
diff --git a/‎deeptime/covariance/util/covar_c/covartools.hpp
+12-13 b/‎deeptime/covariance/util/covar_c/covartools.hpp
+12-13
diff --git a/‎deeptime/covariance/util/covar_c/covartools.py
+6-27 b/‎deeptime/covariance/util/covar_c/covartools.py
+6-27
@@ -12,6 +12,7 @@
     KMeans
     MiniBatchKMeans
     RegularSpace
+    BoxDiscretization
 
 
 ===============================================================================
@@ -24,6 +25,18 @@
 
     ClusterModel
     KMeansModel
+    BoxDiscretizationModel
+
+
+===============================================================================
+Functions
+===============================================================================
+
+.. autosummary::
+    :toctree: generated/
+    :template: class_nomodule.rst
+
+    kmeans_plusplus
 
 
 ===============================================================================
@@ -34,7 +47,7 @@
     :toctree: generated/
     :template: class_nomodule.rst
 
-    _clustering_bindings.Metric
+    Metric
     metrics
     MetricRegistry
 """
@@ -43,4 +56,7 @@
 from ._clustering_bindings import Metric
 from ._kmeans import KMeans, MiniBatchKMeans, KMeansModel
 from ._regspace import RegularSpace
+from ._box import BoxDiscretization, BoxDiscretizationModel
 from ._cluster_model import ClusterModel
+
+from ._kmeans import kmeans_plusplus
@@ -0,0 +1,111 @@
+from typing import Optional
+
+import numpy as np
+
+from ._cluster_model import ClusterModel
+from ..base import Estimator
+
+
+class BoxDiscretizationModel(ClusterModel):
+    r""" Cluster model resulting from a :class:`BoxDiscretization` estimation. Besides what the base
+    cluster model offers, it remembers the grid description and provides a one-hot encoding of discretized data.
+
+    Parameters
+    ----------
+    cluster_centers : ndarray
+        The cluster centers, forwarded to the base cluster model.
+    v0 : ndarray
+        Lower left vertex of the box.
+    v1 : ndarray
+        Upper right vertex of the box.
+    n_boxes : list of int
+        Number of boxes along each dimension, as handed over by the estimator.
+    """
+
+    def __init__(self, cluster_centers: np.ndarray, v0, v1, n_boxes):
+        super().__init__(cluster_centers)
+        # The grid description is stored exactly as given; nothing is copied or converted.
+        self.v0, self.v1, self.n_boxes = v0, v1, n_boxes
+
+    def transform_onehot(self, data, n_jobs=None):
+        r"""Assigns each frame to a discrete state and encodes the assignment as one-hot vectors.
+
+        Parameters
+        ----------
+        data : ndarray
+            Input data
+        n_jobs : int or None, optional, default=None
+            Number of jobs, passed through to :meth:`transform`.
+
+        Returns
+        -------
+        one_hot : ndarray
+            A (T, n_clusters) shaped array of zeros with a single one per row, located in the column
+            of the state the respective frame was assigned to.
+        """
+        states = self.transform(data, n_jobs=n_jobs)
+        n_frames = len(data)
+        encoded = np.zeros((n_frames, self.n_clusters))
+        # One entry per row: row index paired with the assigned state index.
+        encoded[np.arange(n_frames), states] = 1.
+        return encoded
+
+
+class BoxDiscretization(Estimator):
+    r"""An n-dimensional box discretization of Euclidean space.
+
+    It spans an n-dimensional grid based on linspaces along each axis which is then used as cluster centers.
+    The linspaces are bounded either by the user (attributes :attr:`v0` and :attr:`v1`) or estimated from data.
+
+    Parameters
+    ----------
+    dim : int
+        Dimension of the box discretization.
+    n_boxes : int or list of int
+        Number of boxes per dimension or - if given as single integer - for all dimensions.
+    v0 : array or None, optional, default=None
+        Lower left vertex of the box discretization. If not given this is estimated from data.
+    v1 : array or None, optional, default=None
+        Upper right vertex of the box discretization. If not given this is estimated from data.
+
+    Raises
+    ------
+    TypeError
+        If `n_boxes` is a scalar that is not integer-valued.
+    ValueError
+        If the length of `n_boxes`, `v0`, or `v1` does not match `dim`.
+    """
+
+    def __init__(self, dim: int, n_boxes, v0=None, v1=None):
+        super().__init__()
+        if not isinstance(n_boxes, (list, tuple, np.ndarray)):
+            # A scalar is replicated for every dimension. Non-integral scalars are rejected right here
+            # instead of falling through to len() and failing with an unrelated "has no len()" error.
+            if int(n_boxes) != n_boxes:
+                raise TypeError(f"Number of boxes must be an integer or a sequence of integers, "
+                                f"but got {n_boxes!r}.")
+            n_boxes = [int(n_boxes)] * dim
+        if len(n_boxes) != dim:
+            raise ValueError(f"Dimension and number of boxes per dimension did not match ({len(n_boxes)} and {dim}).")
+        if v0 is not None and len(v0) != dim:
+            raise ValueError("Length of v0 did not match dimension.")
+        if v1 is not None and len(v1) != dim:
+            raise ValueError("Length of v1 did not match dimension.")
+        self.dim = dim
+        self.n_boxes = n_boxes
+        self.v0 = v0
+        self.v1 = v1
+
+    def fit(self, data: np.ndarray, **kwargs):
+        r""" Builds the grid of cluster centers, estimating missing box vertices from data.
+
+        Parameters
+        ----------
+        data : ndarray
+            Input data of shape (T, dim). It is only inspected for those vertices (:attr:`v0`, :attr:`v1`)
+            that were not provided by the user.
+        **kwargs
+            Not used by this estimator.
+
+        Returns
+        -------
+        self : BoxDiscretization
+            Reference to self.
+
+        Raises
+        ------
+        ValueError
+            If data is not two-dimensional or its second axis does not have length :attr:`dim`.
+        """
+        # Explicit validation: an assert would vanish under ``python -O``.
+        if data.ndim != 2 or data.shape[1] != self.dim:
+            raise ValueError(f"Expected data of shape (T, {self.dim}) but got an array of shape {data.shape}.")
+        # Only the bounds the user did not fix are estimated, as column-wise extrema of the data.
+        v0 = np.min(data, axis=0) if self.v0 is None else self.v0
+        v1 = np.max(data, axis=0) if self.v1 is None else self.v1
+        linspaces = [np.linspace(v0[d], v1[d], num=self.n_boxes[d], endpoint=True) for d in range(self.dim)]
+        # Stack the per-axis coordinate grids and flatten them into one row per grid point.
+        mesh = np.vstack(np.meshgrid(*linspaces)).reshape(self.dim, -1).T
+        self._model = BoxDiscretizationModel(mesh, v0, v1, self.n_boxes)
+        return self
+
+    def fetch_model(self) -> Optional[BoxDiscretizationModel]:
+        r""" Yields the estimated model.
+
+        Returns
+        -------
+        model : BoxDiscretizationModel or None
+            The model.
+        """
+        return super().fetch_model()
@@ -8,11 +8,44 @@
 from ._cluster_model import ClusterModel
 from . import _clustering_bindings as _bd, metrics
 
-__all__ = ['KMeans', 'MiniBatchKMeans', 'KMeansModel']
-
 from ..util.parallel import handle_n_jobs
 
 
+def kmeans_plusplus(data, n_clusters: int, metric: str = 'euclidean', callback=None, seed: int = -1,
+                    n_jobs: Optional[int] = None):
+    r""" Draws initial cluster centers according to the kmeans++ scheme. :footcite:`arthur2006k`
+
+    Parameters
+    ----------
+    data : np.ndarray
+        Input data of shape (T, n_dim).
+    n_clusters : int
+        How many cluster centers to pick.
+    metric : str, default='euclidean'
+        Name of the metric, looked up in the :data:`metric registry <deeptime.clustering.metrics>`.
+        Defaults to the euclidean metric.
+    callback: callable or None
+        Progress indicator for the initialization, called once per assigned center.
+    seed : int, optional, default=-1
+        The random seed. If non-negative, this fixes the random generator's seed and makes results reproducible.
+    n_jobs : int, optional, default=None
+        Number of jobs.
+
+    Returns
+    -------
+    centers : np.ndarray
+        An (n_centers, dim)-shaped array with a kmeans++ cluster center initial guess.
+
+    References
+    ----------
+    .. footbibliography::
+    """
+    n_threads = handle_n_jobs(n_jobs)
+    # The registry maps the metric name to a factory; instantiate it for the bindings.
+    metric_impl = metrics[metric]()
+    return _bd.kmeans.init_centers_kmpp(
+        data, k=n_clusters, random_seed=seed, n_threads=n_threads, callback=callback, metric=metric_impl
+    )
+
+
 class KMeansModel(ClusterModel):
     r"""The K-means clustering model. Stores all important information which are result of the estimation procedure.
     It can also be used to transform data by assigning each frame to its closest cluster center. For an example
@@ -359,9 +392,8 @@ def _pick_initial_centers(self, data, strategy, n_jobs, callback=None):
         if strategy == 'uniform':
             return data[self.random_state.randint(0, len(data), size=self.n_clusters)]
         elif self.init_strategy == 'kmeans++':
-            metric = metrics[self.metric]()
-            return _bd.kmeans.init_centers_kmpp(data, k=self.n_clusters, random_seed=self.fixed_seed, n_threads=n_jobs,
-                                                callback=callback, metric=metric)
+            return kmeans_plusplus(data, self.n_clusters, self.metric,
+                                   callback=callback, seed=self.fixed_seed, n_jobs=n_jobs)
 
     def fit(self, data, initial_centers=None, callback_init_centers=None, callback_loop=None, n_jobs=None):
         """ Perform the clustering.
 
@@ -1,5 +1,4 @@
 """
-
 Data Types
 ----------
 The standard data type for covariance computations is
@@ -75,20 +74,13 @@
 __author__ = 'noe'
 
 import math
-import numbers
 import numpy as np
 from .covar_c import covartools
 
 
 def _is_zero(x):
     """ Returns True if x is numerically 0 or an array with 0's. """
-    if x is None:
-        return True
-    if isinstance(x, numbers.Number):
-        return x == 0.0
-    if isinstance(x, np.ndarray):
-        return np.all(x == 0)
-    return False
+    return x is None or np.all(np.asarray(x) == 0)
 
 
 def _sparsify(X, remove_mean=False, modify_data=False, sparse_mode='auto', sparse_tol=0.0):
 
@@ -84,8 +84,6 @@ def covar(self, bessels_correction=True):
 
 
 class MomentsStorage:
-    """
-    """
 
     def __init__(self, nsave, remove_mean=False, rtol=1.5):
         """
@@ -196,7 +194,8 @@ def __init__(self, compute_XX=True, compute_XY=False, compute_YY=False,
             warnings.warn('symmetrize=True has no effect with compute_XY=False.')
         if diag_only and sparse_mode != 'dense':
             if sparse_mode == 'sparse':
-                warnings.warn('Computing diagonal entries only is not implemented for sparse mode. Switching to dense mode.')
+                warnings.warn('Computing diagonal entries only is not implemented for sparse mode. '
+                              'Switching to dense mode.')
             sparse_mode = 'dense'
         # storage
         self.compute_XX = compute_XX
@@ -253,9 +252,10 @@ def add(self, X, Y=None, weights=None, column_selection=None):
             # Check appropriate length if weights is an array:
             elif isinstance(weights, np.ndarray):
                 if len(weights) != T:
-                    raise ValueError('weights and X must have equal length. Was {} and {} respectively.'.format(len(weights), len(X)))
+                    raise ValueError(f'Weights and X must have equal length. '
+                                     f'Was {len(weights)} and {len(X)}, respectively.')
             else:
-                raise TypeError('weights is of type %s, must be a number or ndarray' % (type(weights)))
+                raise TypeError(f'Weights is of type {type(weights)}, must be a number or ndarray.')
         # estimate and add to storage
         if self.compute_XX and not self.compute_XY and not self.compute_YY:
             w, s_X, C_XX = moments_XX(X, remove_mean=self.remove_mean, weights=weights, sparse_mode=self.sparse_mode,
 
@@ -11,9 +11,12 @@ PYBIND11_MODULE(_covartools, m) {
     // ================================================
     // Check for constant columns
     // ================================================
-    m.def("variable_cols_char", &_variable_cols<char>);
-    m.def("variable_cols_int", &_variable_cols<int>);
-    m.def("variable_cols_long", &_variable_cols<long>);
-    m.def("variable_cols_float", &_variable_cols<float>);
-    m.def("variable_cols_double", &_variable_cols<double>);
+    m.def("variable_cols", &_variable_cols<char>);
+    m.def("variable_cols", &_variable_cols<bool>);
+    m.def("variable_cols", &_variable_cols<int>);
+    m.def("variable_cols", &_variable_cols<long>);
+    m.def("variable_cols", &_variable_cols<float>);
+    m.def("variable_cols", &_variable_cols<double>);
+    m.def("variable_cols", &_variable_cols<std::complex<float>>);
+    m.def("variable_cols", &_variable_cols<std::complex<double>>);
 }
@@ -1,9 +1,12 @@
 #pragma once
 
 #include <cstdlib>
+#include <complex>
 #include <pybind11/pybind11.h>
 #include <pybind11/numpy.h>
 
+#include "common.h"
+
 namespace py = pybind11;
 
 
@@ -16,30 +19,26 @@ namespace py = pybind11;
 
 */
 template<typename dtype>
-int _variable_cols(py::array_t<bool, py::array::c_style> &np_cols,
-                   const py::array_t<dtype, py::array::c_style> &np_X,
+int _variable_cols(np_array_nfc<bool> &np_cols,
+                   const np_array_nfc<dtype> &np_X,
                    float tol=0, std::size_t min_constant=0) {
     // compare first and last row to get constant candidates
-    std::size_t i, j;
-    std::size_t ro;
     std::size_t M = static_cast<std::size_t>(np_X.shape(0)), N = static_cast<std::size_t>(np_X.shape(1));
-    dtype diff;
     std::size_t nconstant = N;  // current number of constant columns
     auto cols = np_cols.mutable_data(0);
     auto X = np_X.data(0);
     // by default all 0 (constant)
-    for (j = 0; j < N; j++)
+    for (std::size_t j = 0; j < N; j++)
         cols[j] = false;
 
     // go through all rows in order to confirm constant candidates
-    for (i = 0; i < M; i++) {
-        ro = i * N;
-        for (j = 0; j < N; j++) {
+    for (std::size_t i = 0; i < M; i++) {
+        auto ro = i * N;
+        for (std::size_t j = 0; j < N; j++) {
             if (! cols[j]) {
-                // note: the compiler will eliminate this branch, if dtype != (float, double)
-                if (std::is_floating_point<dtype>::value) {
-                    diff = std::abs(X[j] - X[ro + j]);
-                    if (diff >= tol) {
+                if constexpr (std::is_floating_point<dtype>::value) {
+                    auto diff = std::abs(X[j] - X[ro + j]);
+                    if (diff > tol) {
                         cols[j] = true;
                         nconstant--;
                         // are constant columns below threshold? Then interrupt.
 
@@ -1,12 +1,12 @@
 import numpy as np
 
 
-def variable_cols(X: np.ndarray, tol=0.0, min_constant=0):
+def variable_cols(data: np.ndarray, tol=0.0, min_constant=0):
     """ Evaluates which columns are constant (0) or variable (1)
 
     Parameters
     ----------
-    X : ndarray
+    data : ndarray
         Matrix whose columns will be checked for constant or variable.
     tol : float
         Tolerance for float-matrices. When set to 0 only equal columns with
@@ -26,31 +26,10 @@ def variable_cols(X: np.ndarray, tol=0.0, min_constant=0):
         variable / non-constant. False: column is constant.
 
     """
-    if X is None:
-        return None
-    from ._covartools import (variable_cols_double,
-                              variable_cols_float,
-                              variable_cols_int,
-                              variable_cols_long,
-                              variable_cols_char)
+    from ._covartools import variable_cols as impl
     # prepare column array
-    cols = np.zeros(X.shape[1], dtype=bool, order='C')
-
-    if X.dtype == np.float64:
-        completed = variable_cols_double(cols, X, tol, min_constant)
-    elif X.dtype == np.float32:
-        completed = variable_cols_float(cols, X, tol, min_constant)
-    elif X.dtype == np.int32:
-        completed = variable_cols_int(cols, X, 0, min_constant)
-    elif X.dtype == np.int64:
-        completed = variable_cols_long(cols, X, 0, min_constant)
-    elif X.dtype == np.bool_:
-        completed = variable_cols_char(cols, X, 0, min_constant)
-    else:
-        raise TypeError('unsupported type of X: %s' % X.dtype)
+    cols = np.zeros(data.shape[1], dtype=bool, order='C')
+    completed = impl(cols, data, tol, min_constant)
 
     # if interrupted, return all ones. Otherwise return the variable columns as bool array
-    if completed == 0:
-        return np.ones_like(cols, dtype=bool)
-
-    return cols
+    return cols if completed == 1 else np.ones_like(cols, dtype=bool)