Minor changes (#208)

clonker · web-flow · commit dc320dbddd00 · 2022-02-15T17:45:59.000+01:00
diff --git a/deeptime/markov/_base.py b/deeptime/markov/_base.py
@@ -139,7 +139,6 @@ def lagtime(self):
     def ck_test(self, models, n_metastable_sets, include_lag0=True, err_est=False, progress=None):
         r""" Performs a Chapman Kolmogorov test.
         See :meth:`MarkovStateModel.ck_test <deeptime.markov.msm.MarkovStateModel.ck_test>` for more details """
-        from deeptime.util.validation import ChapmanKolmogorovTest
         clustering = self.prior.pcca(n_metastable_sets)
         observable = MembershipsObservable(self, clustering, initial_distribution=self.prior.stationary_distribution)
         from deeptime.util.validation import ck_test
diff --git a/deeptime/markov/_observables.py b/deeptime/markov/_observables.py
@@ -1,3 +1,4 @@
+import numbers
 from typing import Union
 
 import numpy as np
@@ -53,7 +54,12 @@ def __call__(self, model, mlag=1, **kw):
             return np.eye(self.n_sets)
         model = MembershipsObservable._to_markov_model(model)
         # otherwise compute or predict them by model.propagate
-        pk_on_set = np.zeros((self.n_sets, self.n_sets), dtype=float if self.ignore_imaginary_parts else complex)
+        integer_lag = isinstance(mlag, numbers.Integral)
+        if self.ignore_imaginary_parts or (model.is_real and integer_lag and np.all(np.isreal(self.P0))):
+            dtype = float
+        else:
+            dtype = complex
+        pk_on_set = np.zeros((self.n_sets, self.n_sets), dtype=dtype)
         # compute observable on prior in case for Bayesian models.
         symbols = model.count_model.state_symbols
         subset = self._full2active[symbols]  # find subset we are now working on
diff --git a/deeptime/markov/msm/_bayesian_msm.py b/deeptime/markov/msm/_bayesian_msm.py
@@ -1,8 +1,9 @@
 from math import sqrt
-from typing import Optional, Callable
+from typing import Optional, Callable, Union, List
 
 import numpy as np
 
+from .._transition_counting import TransitionCountEstimator
 from ...base import Estimator
 from ...numeric import is_square_matrix
 from .._base import _MSMBaseEstimator, BayesianMSMPosterior
@@ -37,17 +38,17 @@ class BayesianMSM(_MSMBaseEstimator):
         this case python sparse matrices will be returned by the corresponding functions instead of numpy arrays.
         This behavior is suggested for very large numbers of states (e.g. > 4000) because it is likely to be much
         more efficient.
-    confidence : float, optional, default=0.954
-        Confidence interval. By default two sigma (95.4%) is used. Use 68.3% for one sigma, 99.7% for three sigma.
     maxiter : int, optional, default=1000000
         Optional parameter with reversible = True, sets the maximum number of iterations before the transition
         matrix estimation method exits.
-    maxerr : float, optional, default = 1e-8
+    maxerr : float, optional, default=1e-8
         Optional parameter with reversible = True. Convergence tolerance for transition matrix estimation. This
         specifies the maximum change of the Euclidean norm of relative stationary probabilities
         (:math:`x_i = \sum_k x_{ik}`). The relative stationary probability changes
         :math:`e_i = (x_i^{(1)} - x_i^{(2)})/(x_i^{(1)} + x_i^{(2)})` are used in order to track changes in small
         probabilities. The Euclidean norm of the change vector, :math:`|e_i|_2`, is compared to maxerr.
+    lagtime : int, optional, default=None
+        The lagtime that is used when fitting directly from discrete trajectories.
 
     References
     ----------
@@ -130,14 +131,14 @@ class BayesianMSM(_MSMBaseEstimator):
 
     def __init__(self, n_samples: int = 100, n_steps: int = None, reversible: bool = True,
                  stationary_distribution_constraint: Optional[np.ndarray] = None,
-                 sparse: bool = False, confidence: float = 0.954, maxiter: int = int(1e6), maxerr: float = 1e-8):
+                 sparse: bool = False, maxiter: int = int(1e6), maxerr: float = 1e-8, lagtime: Optional[int] = None):
         super(BayesianMSM, self).__init__(reversible=reversible, sparse=sparse)
         self.stationary_distribution_constraint = stationary_distribution_constraint
         self.maxiter = maxiter
         self.maxerr = maxerr
         self.n_samples = n_samples
         self.n_steps = n_steps
-        self.confidence = confidence
+        self.lagtime = lagtime
 
     @property
     def stationary_distribution_constraint(self) -> Optional[np.ndarray]:
@@ -203,18 +204,13 @@ def fit(self, data, callback: Callable = None, **kw):
 
         from deeptime.markov import TransitionCountModel
         if isinstance(data, TransitionCountModel) or is_square_matrix(data):
-            msm = MaximumLikelihoodMSM(
-                reversible=self.reversible, stationary_distribution_constraint=self.stationary_distribution_constraint,
-                sparse=self.sparse, maxiter=self.maxiter, maxerr=self.maxerr
-            ).fit(data).fetch_model()
+            return self.fit_from_counts(data)
         elif isinstance(data, MarkovStateModel):
-            msm = data
+            return self.fit_from_msm(data, callback=callback, **kw)
         else:
-            raise ValueError("Unsupported input data, can only be count matrix (or TransitionCountModel, "
-                             "TransitionCountEstimator) or a MarkovStateModel instance or an estimator producing "
-                             "Markov state models.")
-
-        return self.fit_from_msm(msm, callback=callback, **kw)
+            if not self.lagtime and 'lagtime' not in kw.keys():
+                raise ValueError("To fit directly from a discrete timeseries, a lagtime must be provided!")
+            return self.fit_from_discrete_timeseries(data, kw.pop('lagtime', self.lagtime), callback=callback, **kw)
 
     def sample(self, prior: MarkovStateModel, n_samples: int, n_steps: Optional[int] = None, callback=None):
         r""" Performs sampling based on a prior.
@@ -310,6 +306,55 @@ def fit_from_msm(self, msm: MarkovStateModel, callback=None, **kw):
         self._model = BayesianMSMPosterior(prior=msm, samples=samples)
         return self
 
+    def fit_from_discrete_timeseries(self, discrete_timeseries: Union[np.ndarray, List[np.ndarray]],
+                                     lagtime: int = None, count_mode: str = 'effective', callback=None, **kw):
+        r""" Fits a BayesianMSM directly on timeseries data.
+
+        Parameters
+        ----------
+        discrete_timeseries : ndarray or list of ndarray
+            Discrete trajectories.
+        lagtime : int, optional, default=None
+            The lagtime that is used for estimation. If None, uses the instance's lagtime attribute.
+        count_mode : str, default='effective'
+            The counting mode. Should be of the `effective` kind, otherwise the results may be heavily biased.
+        callback : callable, optional, default=None
+            Function to be called to indicate progress of sampling.
+        **kw
+            Optional keyword parameters, forwarded to :meth:`fit_from_counts`.
+
+        Returns
+        -------
+        self : BayesianMSM
+            Reference to self.
+        """
+        counts = TransitionCountEstimator(lagtime=self.lagtime if lagtime is None else lagtime, count_mode=count_mode,
+                                          sparse=self.sparse).fit_fetch(discrete_timeseries).submodel_largest()
+        return self.fit_from_counts(counts, callback=callback, **kw)
+
+    def fit_from_counts(self, counts, callback=None, **kw):
+        r"""Fits a maximum-likelihood MSM on the given counts and uses it as prior in :meth:`fit_from_msm`.
+
+        Parameters
+        ----------
+        counts : TransitionCountModel or (n, n) ndarray
+            Transition count model or count matrix from which the prior MSM is estimated.
+        callback : callable, optional, default=None
+            Progress callback, handed on to :meth:`fit_from_msm`.
+        **kw
+            Further keyword arguments, handed on to :meth:`fit_from_msm`.
+
+        Returns
+        -------
+        self : BayesianMSM
+            Reference to self.
+        """
+        ml_estimator = MaximumLikelihoodMSM(
+            reversible=self.reversible, stationary_distribution_constraint=self.stationary_distribution_constraint,
+            sparse=self.sparse, maxiter=self.maxiter, maxerr=self.maxerr)
+        prior = ml_estimator.fit(counts).fetch_model()
+        return self.fit_from_msm(prior, callback=callback, **kw)
+
     @deprecated_method("Deprecated in v0.4.1 and will be removed soon, please use model.ck_test.")
     def chapman_kolmogorov_validator(self, n_metastable_sets: int, mlags, test_model=None):
         r""" Replaced by `deeptime.markov.msm.BayesianMSMPosterior.ck_test`. """
diff --git a/deeptime/markov/msm/_markov_state_model.py b/deeptime/markov/msm/_markov_state_model.py
@@ -360,7 +360,7 @@ def _ensure_eigenvalues(self, neig=None):
             if m < neig:
                 # not enough eigenpairs present - recompute:
                 self._compute_eigenvalues(neig)
-        except (AttributeError, TypeError) as e:
+        except (AttributeError, TypeError):
             # no eigendecomposition yet - compute:
             self._compute_eigenvalues(neig)
 
@@ -530,6 +530,12 @@ def _transition_matrix_power(self, power):
                 ])
         return transition_matrix
 
+    @cached_property
+    def is_real(self):
+        r""" Whether all eigenvalues and all left and right eigenvectors of this model are real-valued. """
+        return (np.all(np.isreal(self.eigenvalues()))
+                and np.all(np.isreal(self.eigenvectors_left()) & np.isreal(self.eigenvectors_right())))
+
     def propagate(self, p0, k: int):
         r""" Propagates the initial distribution p0 k times
 
diff --git a/deeptime/markov/msm/_maximum_likelihood_msm.py b/deeptime/markov/msm/_maximum_likelihood_msm.py
@@ -56,6 +56,9 @@ class MaximumLikelihoodMSM(_MSMBaseEstimator):
         Number of counts required to consider two states connected.
     lagtime : int, optional, default=None
         Optional lagtime that can be provided at estimator level if fitting from timeseries directly.
+    use_lcc : bool, default=False
+        If set to true, this will restrict the resulting MSM collection to only contain the largest connected
+        state-space component.
 
     References
     ----------
@@ -64,7 +67,8 @@ class MaximumLikelihoodMSM(_MSMBaseEstimator):
 
     def __init__(self, reversible: bool = True, stationary_distribution_constraint: Optional[np.ndarray] = None,
                  sparse: bool = False, allow_disconnected: bool = False, maxiter: int = int(1e6), maxerr: float = 1e-8,
-                 connectivity_threshold: float = 0, transition_matrix_tolerance: float = 1e-6, lagtime=None):
+                 connectivity_threshold: float = 0, transition_matrix_tolerance: float = 1e-6, lagtime=None,
+                 use_lcc: bool=False):
         super(MaximumLikelihoodMSM, self).__init__(reversible=reversible, sparse=sparse)
 
         self.stationary_distribution_constraint = stationary_distribution_constraint
@@ -74,6 +78,7 @@ def __init__(self, reversible: bool = True, stationary_distribution_constraint:
         self.connectivity_threshold = connectivity_threshold
         self.transition_matrix_tolerance = transition_matrix_tolerance
         self.lagtime = lagtime
+        self.use_lcc = use_lcc
 
     @property
     def allow_disconnected(self) -> bool:
@@ -226,6 +231,8 @@ def fit_from_counts(self, counts: Union[np.ndarray, TransitionCountEstimator, Tr
         transition_matrices = []
         statdists = []
         count_models = []
+        if self.use_lcc:
+            sets = [sets[0]]
         for subset in sets:
             try:
                 sub_counts = counts.submodel(subset)
diff --git a/deeptime/markov/tools/analysis/_assessment.py b/deeptime/markov/tools/analysis/_assessment.py
@@ -105,7 +105,7 @@ def is_reversible(T, mu=None, tol=1e-15):
         mu = stationary_distribution(T)
 
     if sparse.issparse(T):
-        prod = sparse.construct.diags(mu) * T
+        prod = sparse.diags(mu) * T
     else:
         prod = mu[:, None] * T
 
diff --git a/deeptime/markov/tools/analysis/dense/_correlations.py b/deeptime/markov/tools/analysis/dense/_correlations.py
@@ -233,7 +233,7 @@ def time_relaxation_direct_by_diagonalization(P, p0, obs, time, rdl=None):
     return result
 
 
-def time_relaxations_direct(P, p0, obs, times=[1]):
+def time_relaxations_direct(P, p0, obs, times=(1,)):
     r"""Compute time-relaxations of obs with respect of given initial distribution.
 
     relaxation(k) = p0 P^k obs
diff --git a/deeptime/markov/tools/estimation/api.py b/deeptime/markov/tools/estimation/api.py
@@ -13,7 +13,6 @@
 from scipy.sparse import coo_matrix
 from scipy.sparse import csr_matrix
 from scipy.sparse import issparse
-from scipy.sparse.sputils import isdense
 
 from deeptime.util.types import ensure_dtraj_list
 from . import dense
@@ -378,10 +377,7 @@ def connected_sets(C, directed=True):
     [array([0, 1, 2])]
 
     """
-    if isdense(C):
-        return sparse.connectivity.connected_sets(csr_matrix(C), directed=directed)
-    else:
-        return sparse.connectivity.connected_sets(C, directed=directed)
+    return sparse.connectivity.connected_sets(C if issparse(C) else csr_matrix(C), directed=directed)
 
 
 def largest_connected_set(C, directed=True):
@@ -432,10 +428,7 @@ def largest_connected_set(C, directed=True):
     array([0, 1, 2])
 
     """
-    if isdense(C):
-        return sparse.connectivity.largest_connected_set(csr_matrix(C), directed=directed)
-    else:
-        return sparse.connectivity.largest_connected_set(C, directed=directed)
+    return sparse.connectivity.largest_connected_set(C if issparse(C) else csr_matrix(C), directed=directed)
 
 
 def largest_connected_submatrix(C, directed=True, lcc=None):
@@ -492,10 +485,9 @@ def largest_connected_submatrix(C, directed=True, lcc=None):
            [ 0,  0,  4]]...)
 
     """
-    if isdense(C):
-        return sparse.connectivity.largest_connected_submatrix(csr_matrix(C), directed=directed, lcc=lcc).toarray()
-    else:
-        return sparse.connectivity.largest_connected_submatrix(C, directed=directed, lcc=lcc)
+    lcc = sparse.connectivity.largest_connected_submatrix(C if issparse(C) else csr_matrix(C),
+                                                          directed=directed, lcc=lcc)
+    return lcc if issparse(C) else lcc.toarray()
 
 
 def is_connected(C, directed=True):
@@ -542,10 +534,7 @@ def is_connected(C, directed=True):
     True
 
     """
-    if isdense(C):
-        return sparse.connectivity.is_connected(csr_matrix(C), directed=directed)
-    else:
-        return sparse.connectivity.is_connected(C, directed=directed)
+    return sparse.connectivity.is_connected(C if issparse(C) else csr_matrix(C), directed=directed)
 
 
 ################################################################################
@@ -591,7 +580,7 @@ def prior_neighbor(C, alpha=0.001):
 
     """
 
-    if isdense(C):
+    if not issparse(C):
         B = sparse.prior.prior_neighbor(csr_matrix(C), alpha=alpha)
         return B.toarray()
     else:
@@ -633,7 +622,7 @@ def prior_const(C, alpha=0.001):
            [0.001, 0.001, 0.001]])
 
     """
-    if not isdense(C):
+    if issparse(C):
         warnings.warn("Prior will be a dense matrix for sparse input")
     return sparse.prior.prior_const(C, alpha=alpha)
 
@@ -690,11 +679,7 @@ def prior_rev(C, alpha=-1.0):
            [ 0.,  0., -1.]])
 
     """
-    if isdense(C):
-        return sparse.prior.prior_rev(C, alpha=alpha)
-    else:
-        # warnings.warn("Prior will be a dense matrix for sparse input")
-        return sparse.prior.prior_rev(C, alpha=alpha)
+    return sparse.prior.prior_rev(C, alpha=alpha)
 
 
 ################################################################################
@@ -803,7 +788,7 @@ def transition_matrix(C, reversible=False, mu=None, method='auto',
     """
     if issparse(C):
         sparse_input_type = True
-    elif isdense(C):
+    elif isinstance(C, np.ndarray):
         sparse_input_type = False
     else:
         raise NotImplementedError('C has an unknown type.')
diff --git a/deeptime/markov/tools/estimation/sparse/effective_counts.py b/deeptime/markov/tools/estimation/sparse/effective_counts.py
@@ -6,7 +6,6 @@
 
 import numpy as np
 import scipy.sparse
-from scipy.sparse.csr import csr_matrix
 
 from threadpoolctl import threadpool_limits
 
@@ -215,7 +214,7 @@ def statistical_inefficiencies(dtrajs, lag, C=None, truncate_acf=True, mact=2.0,
                                                    truncate_acf=truncate_acf, mact=mact)
             if callback is not None:
                 callback(1)
-    res = csr_matrix((data, (I, J)), shape=C.shape)
+    res = scipy.sparse.csr_matrix((data, (I, J)), shape=C.shape)
     return res
 
 
diff --git a/deeptime/markov/tools/flux/api.py b/deeptime/markov/tools/flux/api.py
@@ -1,6 +1,5 @@
 import numpy as _np
-from scipy.sparse import csr_matrix
-from scipy.sparse.base import issparse
+from scipy.sparse import csr_matrix, issparse
 
 from deeptime.util.sparse import remove_negative_entries
 
diff --git a/deeptime/plots/chapman_kolmogorov.py b/deeptime/plots/chapman_kolmogorov.py
diff --git a/deeptime/util/validation.py b/deeptime/util/validation.py
diff --git a/tests/markov/msm/test_bayesian_msm.py b/tests/markov/msm/test_bayesian_msm.py
diff --git a/tests/plots/test_ck_test.py b/tests/plots/test_ck_test.py