diff --git a/README.md b/README.md
index fce076b..605fc89 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ steps and indices to allow for fast re-computation with new parameters.
 
 Package documentation can be found at http://mmp2.github.io/megaman/
 
-If you use our software please cite the following JMLR paper: 
+If you use our software please cite the following JMLR paper:
 
 McQueen, Meila, VanderPlas, & Zhang, "Megaman: Scalable Manifold Learning
 in Python", Journal of Machine Learning Research, Vol 17 no. 14, 2016.
@@ -64,10 +64,11 @@ To install megaman from source requires the following:
 
 Optional requirements include
 - [pyamg](http://pyamg.org/), which allows for faster decompositions of large matrices
+- [pysamg](http://scai.fraunhofer.de/samg/), which allows for even faster decompositions of large matrices. It is included in, and requires a license for, the commercial software Fraunhofer SAMG. For licensing (including test or educational licenses) contact samg@scai.fraunhofer.de
 - [pyflann](http://www.cs.ubc.ca/research/flann/) which offers another method of computing distance matrices (this is bundled with the FLANN source code)
 - [nose](https://nose.readthedocs.org/) for running the unit tests
 
-These requirements can be installed on Linux and MacOSX using the following conda command:
+These requirements (except for SAMG) can be installed on Linux and MacOSX using the following conda command:
 
 ```
 $ conda install --channel=conda-forge pip nose coverage gcc cython numpy scipy scikit-learn pyflann pyamg
@@ -91,7 +92,7 @@ to run the unit tests. ``megaman`` is tested on Python versions 2.7, 3.4, and 3.
 - [Zhongyue Zhang](https://github.com/Jerryzcn)
 - [Jake VanderPlas](http://www.vanderplas.com)
 
-## Other Contributors 
+## Other Contributors
 
 - Xiao Wang: lazy rmetric, Nystrom Extension
 
diff --git a/doc/installation.rst b/doc/installation.rst
index 8abf57c..25e6b91 100644
--- a/doc/installation.rst
+++ b/doc/installation.rst
@@ -30,10 +30,11 @@ To install ``megaman`` from source requires the following:
 Optional requirements include:
 
 - pyamg_, which provides fast decompositions of large sparse matrices
+- pysamg_, which provides fast decompositions of large sparse matrices. It is included in, and requires a license for, the commercial software Fraunhofer SAMG.
 - pyflann_, which offers an alternative FLANN interface for computing distance matrices (this is bundled with the FLANN source code)
 - nose_ for running the unit tests
 
-These requirements can be installed on Linux and MacOSX using the following conda command::
+These requirements (except SAMG) can be installed on Linux and MacOSX using the following conda command::
 
     $ conda install --channel=jakevdp pip nose coverage gcc cython numpy scipy scikit-learn pyflann pyamg
 
@@ -60,6 +61,7 @@ or, outside the source directory once ``megaman`` is installed::
 .. _scikit-learn: http://scikit-learn.org
 .. _FLANN: http://www.cs.ubc.ca/research/flann/
 .. _pyamg: http://pyamg.org/
+.. _pysamg: http://scai.fraunhofer.de/samg/
 .. _pyflann: http://www.cs.ubc.ca/research/flann/
 .. _nose: https://nose.readthedocs.org/
 .. _cython: http://cython.org/
diff --git a/megaman/embedding/isomap.py b/megaman/embedding/isomap.py
index acb320c..814294e 100644
--- a/megaman/embedding/isomap.py
+++ b/megaman/embedding/isomap.py
@@ -34,7 +34,7 @@ def isomap(geom, n_components=8, eigen_solver='auto',
     geom : a Geometry object from megaman.geometry.geometry
     n_components : integer, optional
         The dimension of the projection subspace.
- eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : @@ -52,9 +52,17 @@ def isomap(geom, n_components=8, eigen_solver='auto', 'amg' : AMG requires pyamg to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de random_state : int seed, RandomState instance, or None (default) A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when eigen_solver == 'amg'. + lobpcg eigen vectors decomposition when eigen_solver == 'amg' or + eigen_solver == 'samg'. By default, arpack is used. path_method : string, method for computing graph shortest path. One of : 'auto', 'D', 'FW', 'BF', 'J'. See scipy.sparse.csgraph.shortest_path @@ -125,13 +133,12 @@ class Isomap(BaseEmbedding): specification of geometry parameters: keys are ["adjacency_method", "adjacency_kwds", "affinity_method", "affinity_kwds", "laplacian_method", "laplacian_kwds"] - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : - use standard dense matrix operations for the eigenvalue - decomposition. Uses a dense data array, and thus should be avoided - for large problems. + use standard dense matrix operations for the eigenvalue decomposition. + For this method, M must be an array or matrix type. This method should be avoided for large problems. 'arpack' : use arnoldi iteration in shift-invert mode. For this method, M may be a dense matrix, sparse matrix, or general linear operator. @@ -144,6 +151,13 @@ class Isomap(BaseEmbedding): 'amg' : AMG requires pyamg to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack iterations. Defaults to numpy.random.RandomState @@ -192,17 +206,38 @@ def fit(self, X, y=None, input_type='data'): Interpret X as precomputed distance or adjacency graph computed from samples. - eigen_solver : {None, 'arpack', 'lobpcg', or 'amg'} - The eigenvalue decomposition strategy to use. AMG requires pyamg - to be installed. It can be faster on very large, sparse problems, - but may also lead to instabilities. + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} + 'auto' : + algorithm will attempt to choose the best method for input data + 'dense' : + use standard dense matrix operations for the eigenvalue decomposition. + For this method, M must be an array or matrix type. This method should be avoided for large problems. 
+ 'arpack' : + use arnoldi iteration in shift-invert mode. For this method, + M may be a dense matrix, sparse matrix, or general linear operator. + Warning: ARPACK can be unstable for some problems. It is best to + try several random seeds in order to check results. + 'lobpcg' : + Locally Optimal Block Preconditioned Conjugate Gradient Method. + A preconditioned eigensolver for large symmetric positive definite + (SPD) generalized eigenproblems. + 'amg' : + AMG requires pyamg to be installed. It can be faster on very large, + sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de Returns ------- self : object Returns the instance itself. """ - + X = self._validate_input(X, input_type) self.fit_geometry(X, input_type) diff --git a/megaman/embedding/locally_linear.py b/megaman/embedding/locally_linear.py index 78f0aa4..305b9f3 100644 --- a/megaman/embedding/locally_linear.py +++ b/megaman/embedding/locally_linear.py @@ -70,7 +70,7 @@ def locally_linear_embedding(geom, n_components, reg=1e-3, reg : float regularization constant, multiplies the trace of the local covariance matrix of the distances. - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : @@ -88,6 +88,13 @@ def locally_linear_embedding(geom, n_components, reg=1e-3, 'amg' : AMG requires pyamg to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack iterations. Defaults to numpy.random. @@ -143,13 +150,12 @@ class LocallyLinearEmbedding(BaseEmbedding): specification of geometry parameters: keys are ["adjacency_method", "adjacency_kwds", "affinity_method", "affinity_kwds", "laplacian_method", "laplacian_kwds"] - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : - use standard dense matrix operations for the eigenvalue - decomposition. Uses a dense data array, and thus should be avoided - for large problems. + use standard dense matrix operations for the eigenvalue decomposition. + For this method, M must be an array or matrix type. This method should be avoided for large problems. 'arpack' : use arnoldi iteration in shift-invert mode. For this method, M may be a dense matrix, sparse matrix, or general linear operator. @@ -162,6 +168,13 @@ class LocallyLinearEmbedding(BaseEmbedding): 'amg' : AMG requires pyamg to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities. 
+ 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack iterations. Defaults to numpy.random.RandomState diff --git a/megaman/embedding/ltsa.py b/megaman/embedding/ltsa.py index 23e6d0f..377c71d 100644 --- a/megaman/embedding/ltsa.py +++ b/megaman/embedding/ltsa.py @@ -30,7 +30,7 @@ def ltsa(geom, n_components, eigen_solver='auto', geom : a Geometry object from megaman.geometry.geometry n_components : integer number of coordinates for the manifold. - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : @@ -48,6 +48,13 @@ def ltsa(geom, n_components, eigen_solver='auto', 'amg' : AMG requires pyamg to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack iterations. Defaults to numpy.random. @@ -125,13 +132,12 @@ class LTSA(BaseEmbedding): specification of geometry parameters: keys are ["adjacency_method", "adjacency_kwds", "affinity_method", "affinity_kwds", "laplacian_method", "laplacian_kwds"] - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : - use standard dense matrix operations for the eigenvalue - decomposition. Uses a dense data array, and thus should be avoided - for large problems. + use standard dense matrix operations for the eigenvalue decomposition. + For this method, M must be an array or matrix type. This method should be avoided for large problems. 'arpack' : use arnoldi iteration in shift-invert mode. For this method, M may be a dense matrix, sparse matrix, or general linear operator. @@ -144,6 +150,13 @@ class LTSA(BaseEmbedding): 'amg' : AMG requires pyamg to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack iterations. 
Defaults to numpy.random.RandomState diff --git a/megaman/embedding/spectral_embedding.py b/megaman/embedding/spectral_embedding.py index 6cd8ecd..1c76124 100644 --- a/megaman/embedding/spectral_embedding.py +++ b/megaman/embedding/spectral_embedding.py @@ -112,13 +112,13 @@ def spectral_embedding(geom, n_components=8, eigen_solver='auto', The ``adjacency`` variable is not strictly the adjacency matrix of a graph but more generally an affinity or similarity matrix between samples (for instance the heat kernel of a euclidean distance matrix or a k-NN matrix). - The Laplacian must be symmetric so that the eigen vector decomposition works as expected. + The Laplacian must be symmetric so that the eigen vector decomposition works as expected. This is ensured by the default setting (for more details, see the documentation in geometry.py). - + The data and generic geometric parameters are passed via a Geometry object, which also computes the Laplacian. By default, the 'geometric' Laplacian (or "debiased", or "renormalized" with - alpha=1) is used. This is the Laplacian construction defined in [Coifman and Lafon, 2006] (see also + alpha=1) is used. This is the Laplacian construction defined in [Coifman and Lafon, 2006] (see also documentation in laplacian.py). Thus, with diffusion_maps=False, spectral embedding is a modification of the Laplacian Eigenmaps algorithm of [Belkin and Nyiogi, 2002], with diffusion_maps=False, geom.laplacian_method ='symmetricnormalized' it is exactly the Laplacian Eigenmaps, with diffusion_maps=True, diffusion_time>0 it @@ -130,7 +130,7 @@ def spectral_embedding(geom, n_components=8, eigen_solver='auto', geom : a Geometry object from megaman.embedding.geometry n_components : integer, optional The dimension of the projection subspace. - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : @@ -148,9 +148,17 @@ def spectral_embedding(geom, n_components=8, eigen_solver='auto', 'amg' : AMG requires pyamg to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de random_state : int seed, RandomState instance, or None (default) A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when eigen_solver == 'amg'. + lobpcg eigen vectors decomposition when eigen_solver == 'amg' + or eigen_solver == 'samg'. By default, arpack is used. drop_first : bool, optional, default=True Whether to drop the first eigenvector. For spectral embedding, this @@ -158,10 +166,10 @@ def spectral_embedding(geom, n_components=8, eigen_solver='auto', connected graph, but for spectral clustering, this should be kept as False to retain the first eigenvector. diffusion_map : boolean, optional. Whether to return the diffusion map - version by re-scaling the embedding coordinate by the eigenvalues to the power + version by re-scaling the embedding coordinate by the eigenvalues to the power diffusion_time. 
- diffusion_time: if diffusion_map=True, the eigenvectors of the Laplacian are rescaled by - (1-lambda)^diffusion_time, where lambda is the corresponding eigenvalue. + diffusion_time: if diffusion_map=True, the eigenvectors of the Laplacian are rescaled by + (1-lambda)^diffusion_time, where lambda is the corresponding eigenvalue. diffusion_time has the role of scale parameter. One of the main ideas of diffusion framework is that running the diffusion forward in time (taking larger and larger powers of the Laplacian/transition matrix) reveals the geometric structure of X at larger and @@ -214,12 +222,12 @@ def spectral_embedding(geom, n_components=8, eigen_solver='auto', nvec=n_components + 1) re_normalize = False PD_solver = False - if eigen_solver in ['amg', 'lobpcg']: # these methods require a symmetric positive definite matrix! + if eigen_solver in ['samg', 'amg', 'lobpcg']: # these methods require a symmetric positive definite matrix! epsilon = 2 PD_solver = True if lapl_type not in ['symmetricnormalized', 'unnormalized']: re_normalize = True - # If lobpcg (or amg with lobpcg) is chosen and + # If lobpcg (or amg/samg with lobpcg) is chosen and # If the Laplacian is non-symmetric then we need to extract: # the w (weight) vector from geometry # and the symmetric Laplacian = S. @@ -267,10 +275,10 @@ def spectral_embedding(geom, n_components=8, eigen_solver='auto', if re_normalize: diffusion_map /= np.sqrt(w[:, np.newaxis]) # put back on original Laplacian space diffusion_map /= np.linalg.norm(diffusion_map, axis = 0) # norm 1 vectors - # sort the eigenvalues + # sort the eigenvalues ind = np.argsort(lambdas); ind = ind[::-1] lambdas = lambdas[ind]; lambdas[0] = 0 - diffusion_map = diffusion_map[:, ind] + diffusion_map = diffusion_map[:, ind] eigenvalues = lambdas.copy() eigenvectors = diffusion_map.copy() if diffusion_maps: @@ -307,13 +315,12 @@ class SpectralEmbedding(BaseEmbedding): specification of geometry parameters: keys are ["adjacency_method", "adjacency_kwds", "affinity_method", "affinity_kwds", "laplacian_method", "laplacian_kwds"] - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : - use standard dense matrix operations for the eigenvalue - decomposition. Uses a dense data array, and thus should be avoided - for large problems. + use standard dense matrix operations for the eigenvalue decomposition. + For this method, M must be an array or matrix type. This method should be avoided for large problems. 'arpack' : use arnoldi iteration in shift-invert mode. For this method, M may be a dense matrix, sparse matrix, or general linear operator. @@ -326,6 +333,13 @@ class SpectralEmbedding(BaseEmbedding): 'amg' : AMG requires pyamg to be installed. It can be faster on very large, sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack iterations. 
Defaults to numpy.random.RandomState
@@ -368,7 +382,7 @@ def __init__(self, n_components=2, radius=None, geom=None,
     def fit(self, X, y=None, input_type='data'):
         """
         Fit the model from data in X.
-        
+
         Parameters
         ----------
         input_type : string, one of: 'data', 'distance' or 'affinity'.
@@ -378,14 +392,14 @@ def __init__(self, n_components=2, radius=None, geom=None,
             and n_features is the number of features.
 
             If self.input_type is distance, or affinity:
-            
+
             X : array-like, shape (n_samples, n_samples),
                 Interpret X as precomputed distance or adjacency graph
                 computed from samples.
 
         Returns
         -------
-        self : object 
+        self : object
             Returns the instance itself.
         """
         X = self._validate_input(X, input_type)
@@ -407,9 +421,9 @@ def fit(self, X, y=None, input_type='data'):
     def predict(self, X_test, y=None):
         """ Predict embedding on new data X_test given the existing embedding on training data
-        
+
         Uses the Nystrom Extension to estimate the eigenvectors.
-        
+
         Currently only works with input_type data (i.e. not affinity or distance)
         """
         if not hasattr(self, 'geom_'):
@@ -423,7 +437,7 @@ def predict(self, X_test, y=None):
                 cyflann_kwds = adjacency_kwds['cyflann_kwds']
             else:
                 cyflann_kwds = {}
-        total_adjacency_matrix = complete_adjacency_matrix(self.geom_.adjacency_matrix, 
+        total_adjacency_matrix = complete_adjacency_matrix(self.geom_.adjacency_matrix,
                                                            self.geom_.X,
                                                            X_test,adjacency_kwds)
         # Compute the affinity matrix, check method and kwds
@@ -461,4 +475,4 @@ def predict(self, X_test, y=None):
         embedding = eigenvectors
         (n_sample_test) = X_test.shape[0]
         embedding_test=embedding[-n_sample_test:, :]
-        return embedding_test, embedding
\ No newline at end of file
+        return embedding_test, embedding
diff --git a/megaman/utils/eigendecomp.py b/megaman/utils/eigendecomp.py
index b3e4fca..7338821 100644
--- a/megaman/utils/eigendecomp.py
+++ b/megaman/utils/eigendecomp.py
@@ -10,9 +10,10 @@
 from .validation import check_array
 
 EIGEN_SOLVERS = ['auto', 'dense', 'arpack', 'lobpcg']
 BAD_EIGEN_SOLVERS = {}
-AMG_KWDS = ['strength', 'aggregate', 'smooth', 'max_levels', 'max_coarse']
+AMG_KWDS = ['strength', 'aggregate', 'smooth', 'max_levels', 'max_coarse',
+            'samg_param']
 
 try:
     from pyamg import smoothed_aggregation_solver
@@ -24,6 +25,18 @@
     but pyamg is not available. Please either
     install pyamg or use another method."""
 
+try:
+    from pysamg import SAMG
+    SAMG_LOADED = True
+    EIGEN_SOLVERS.append('samg')
+except ImportError:
+    SAMG_LOADED = False
+    BAD_EIGEN_SOLVERS['samg'] = """The eigen_solver was set to 'samg',
+    but SAMG is not available. Please either
+    install SAMG or use another method. For more information and
+    licensing (including test or educational licenses)
+    contact samg@scai.fraunhofer.de """
+
 
 def check_eigen_solver(eigen_solver, solver_kwds, size=None, nvec=None):
     """Check that the selected eigensolver is valid
@@ -59,7 +72,10 @@ def check_eigen_solver(eigen_solver, solver_kwds, size=None, nvec=None):
 
     elif eigen_solver == 'auto':
         if size > 200 and nvec < 10:
-            if PYAMG_LOADED:
+            if SAMG_LOADED:
+                eigen_solver = 'samg'
+                solver_kwds = None
+            elif PYAMG_LOADED:
                 eigen_solver = 'amg'
                 solver_kwds = None
             else:
@@ -92,7 +108,7 @@ def eigen_decomposition(G, n_components=8, eigen_solver='auto',
         The square matrix for which to compute the eigen-decomposition.
n_components : integer, optional The number of eigenvectors to return - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : attempt to choose the best method for input data (default) 'dense' : @@ -112,9 +128,17 @@ def eigen_decomposition(G, n_components=8, eigen_solver='auto', Algebraic Multigrid solver (requires ``pyamg`` to be installed) It can be faster on very large, sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de random_state : int seed, RandomState instance, or None (default) A pseudo random number generator used for the initialization of the - lobpcg eigen vectors decomposition when eigen_solver == 'amg'. + lobpcg eigen vectors decomposition when eigen_solver == 'amg' + or eigen_solver == 'samg'. By default, arpack is used. solver_kwds : any additional keyword arguments to pass to the selected eigen_solver @@ -179,6 +203,63 @@ def eigen_decomposition(G, n_components=8, eigen_solver='auto', # Use AMG to get a preconditioner and speed up the eigenvalue problem. ml = smoothed_aggregation_solver(check_array(G, accept_sparse = ['csr']),**(amg_kwds or {})) M = ml.aspreconditioner() + n_find = min(n_nodes, 5 + 2*n_components) + X = random_state.rand(n_nodes, n_find) + X[:, 0] = (G.diagonal()).ravel() + lambdas, diffusion_map = lobpcg(G, X, M=M, largest=largest,**(lobpcg_kwds or {})) + sort_order = np.argsort(lambdas) + if largest: + lambdas = lambdas[sort_order[::-1]] + diffusion_map = diffusion_map[:, sort_order[::-1]] + else: + lambdas = lambdas[sort_order] + diffusion_map = diffusion_map[:, sort_order] + lambdas = lambdas[:n_components] + diffusion_map = diffusion_map[:, :n_components] + elif eigen_solver == 'samg': + # separate samg & lobpcg keywords: + if solver_kwds is not None: + amg_kwds = {} + lobpcg_kwds = solver_kwds.copy() + for kwd in AMG_KWDS: + if kwd in solver_kwds.keys(): + amg_kwds[kwd] = solver_kwds[kwd] + del lobpcg_kwds[kwd] + else: + amg_kwds = dict() # Not None, because we may need it later, see below + lobpcg_kwds = None + if not is_symmetric: + raise ValueError("lobpcg requires symmetric matrices.") + if not sparse.issparse(G): + warnings.warn("AMG works better for sparse matrices") + + ### Use SAMG to get a preconditioner and speed up the eigenvalue problem. 
+        G = check_array(G, accept_sparse = ['csr']) # Ensure we're dealing with
+                                                    # a CSR matrix
+
+        ## Determine if the matrix may be too dense for standard SAMG parameters
+        ## (only if the user didn't supply their own SAMG parameters)
+        if ('samg_param' not in amg_kwds or all(x not in amg_kwds['samg_param']
+            for x in ['ncg','ecg','a_cmplx','g_cmplx','w_avrge','nwt'])):
+            density = G.getnnz()/G.shape[0] # average number of entries per row
+            if density >= 40:
+                warnings.warn("The given matrix has more than 40 entries per "
+                              "row: switching to a more aggressive coarsening"
+                              " to prevent overuse of memory.")
+                if 'samg_param' not in amg_kwds:
+                    amg_kwds['samg_param'] = dict()
+                # aggregative coarsening
+                amg_kwds['samg_param'].update({'ncg':171, 'ecg':21.9,
+                                               'a_cmplx':4.5, 'g_cmplx':1.3,
+                                               'w_avrge':1.7, 'nwt':1})
+
+            elif density >= 20:
+                warnings.warn("The given matrix has more than 20 entries per row:"
+                              " standard SAMG parameters may use too much memory.")
+
+        s = SAMG.Solver(G, **(amg_kwds or {}))
+        M = s.aspreconditioner()
+
         n_find = min(n_nodes, 5 + 2*n_components)
         X = random_state.rand(n_nodes, n_find)
         X[:, 0] = (G.diagonal()).ravel()
@@ -235,12 +316,13 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack',
         Number of eigenvalues/vectors to return
     k_skip : integer, optional
         Number of low eigenvalues to skip.
-    eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'}
+    eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'}
         'auto' :
             algorithm will attempt to choose the best method for input data
         'dense' :
             use standard dense matrix operations for the eigenvalue decomposition.
-            For this method, M must be an array or matrix type.  This method should be avoided for large problems.
+            For this method, M must be an array or matrix type.
+            This method should be avoided for large problems.
         'arpack' :
             use arnoldi iteration in shift-invert mode. For this method,
             M may be a dense matrix, sparse matrix, or general linear operator.
             Warning: ARPACK can be unstable for some problems. It is best to
             try several random seeds in order to check results.
         'lobpcg' :
             Locally Optimal Block Preconditioned Conjugate Gradient Method.
             A preconditioned eigensolver for large symmetric positive definite
             (SPD) generalized eigenproblems.
         'amg' :
             AMG requires pyamg to be installed. It can be faster on very large,
             sparse problems, but may also lead to instabilities.
+        'samg' :
+            Algebraic Multigrid solver from Fraunhofer SCAI (requires
+            ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be
+            significantly faster on very large, sparse problems. Note that SAMG
+            is a commercial product and one needs a license to use it. For
+            licensing (including test or educational licenses)
+            contact samg@scai.fraunhofer.de
     random_state: numpy.RandomState or int, optional
         The generator or seed used to determine the starting vector for
         arpack iterations.  Defaults to numpy.random.
@@ -262,12 +351,12 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack',
     -------
     null_space : estimated k vectors of the null space
     error : estimated error (sum of eigenvalues)
-    
+
     Notes
     -----
     dense solver key words: see
         http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.linalg.eigh.html
-        for symmetric problems and 
+        for symmetric problems and
         http://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.linalg.eig.html#scipy.linalg.eig
         for non symmetric problems.
arpack sovler key words: see @@ -314,7 +403,7 @@ def null_space(M, k, k_skip=1, eigen_solver='arpack', # M, eigvals=(k_skip, k + k_skip - 1), overwrite_a=True) # index = np.argsort(np.abs(eigen_values)) # return eigen_vectors[:, index], np.sum(eigen_values) - elif (eigen_solver == 'amg' or eigen_solver == 'lobpcg'): + elif (eigen_solver == 'amg' or eigen_solver == 'samg' or eigen_solver == 'lobpcg'): # M should be positive semi-definite. Add 1 to make it pos. def. try: M = sparse.identity(M.shape[0]) + M diff --git a/megaman/utils/spectral_clustering.py b/megaman/utils/spectral_clustering.py index 4cee32a..f62a8ac 100644 --- a/megaman/utils/spectral_clustering.py +++ b/megaman/utils/spectral_clustering.py @@ -15,14 +15,14 @@ class SpectralClustering(BaseEmbedding): """ - Spectral clustering for find K clusters by using the eigenvectors of a + Spectral clustering for find K clusters by using the eigenvectors of a matrix which is derived from a set of similarities S. Parameters ----------- K: integer number of K clusters - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : @@ -39,31 +39,38 @@ class SpectralClustering(BaseEmbedding): (SPD) generalized eigenproblems. 'amg' : AMG requires pyamg to be installed. It can be faster on very large, - sparse problems, but may also lead to instabilities. - + sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de + random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack - iterations. Defaults to numpy.random.RandomState - solver_kwds : any additional keyword arguments to pass to the selected eigen_solver - renormalize : (bool) whether or not to set the rows of the eigenvectors to have norm 1 + iterations. Defaults to numpy.random.RandomState + solver_kwds : any additional keyword arguments to pass to the selected eigen_solver + renormalize : (bool) whether or not to set the rows of the eigenvectors to have norm 1 this can improve label quality stabalize : (bool) whether or not to compute the (more stable) eigenvectors of L = D^-1/2*S*D^-1/2 - instead of P = D^-1*S - """ - def __init__(self,K,eigen_solver='auto', + instead of P = D^-1*S + """ + def __init__(self,K,eigen_solver='auto', random_state=None, solver_kwds = None, geom = None, radius = None, renormalize = True, stabalize = True, additional_vectors=0): self.eigen_solver = eigen_solver - self.random_state = random_state + self.random_state = random_state self.K = K - self.solver_kwds = solver_kwds + self.solver_kwds = solver_kwds self.geom = geom self.radius = radius self.renormalize = renormalize self.stabalize = stabalize self.additional_vectors = 0 - + def fit(self, X, y=None, input_type='affinity'): """ Fit the model from data in X. @@ -78,32 +85,32 @@ def fit(self, X, y=None, input_type='affinity'): If self.input_type is similarity: X : array-like, shape (n_samples, n_samples), - copy the similarity matrix X to S. + copy the similarity matrix X to S. 
""" X = self._validate_input(X, input_type) self.fit_geometry(X, input_type) - random_state = check_random_state(self.random_state) - self.embedding_, self.eigen_vectors_, self.P_ = spectral_clustering(self.geom_, K = self.K, - eigen_solver = self.eigen_solver, - random_state = self.random_state, + random_state = check_random_state(self.random_state) + self.embedding_, self.eigen_vectors_, self.P_ = spectral_clustering(self.geom_, K = self.K, + eigen_solver = self.eigen_solver, + random_state = self.random_state, solver_kwds = self.solver_kwds, renormalize = self.renormalize, stabalize = self.stabalize, - additional_vectors = self.additional_vectors) - -def spectral_clustering(geom, K, eigen_solver = 'dense', random_state = None, solver_kwds = None, + additional_vectors = self.additional_vectors) + +def spectral_clustering(geom, K, eigen_solver = 'dense', random_state = None, solver_kwds = None, renormalize = True, stabalize = True, additional_vectors = 0): """ - Spectral clustering for find K clusters by using the eigenvectors of a + Spectral clustering for find K clusters by using the eigenvectors of a matrix which is derived from a set of similarities S. Parameters ----------- S: array-like,shape(n_sample,n_sample) - similarity matrix + similarity matrix K: integer number of K clusters - eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', or 'amg'} + eigen_solver : {'auto', 'dense', 'arpack', 'lobpcg', 'amg' or 'samg'} 'auto' : algorithm will attempt to choose the best method for input data 'dense' : @@ -120,33 +127,40 @@ def spectral_clustering(geom, K, eigen_solver = 'dense', random_state = None, so (SPD) generalized eigenproblems. 'amg' : AMG requires pyamg to be installed. It can be faster on very large, - sparse problems, but may also lead to instabilities. - + sparse problems, but may also lead to instabilities. + 'samg' : + Algebraic Multigrid solver from Fraunhofer SCAI (requires + ``Fraunhofer SAMG`` and ``pysamg`` to be installed). It can be + significantly faster on very large, sparse problems. Note that SAMG + is a commercial product and one needs a license to use it. For + licensing (including test or educational licenses) + contact samg@scai.fraunhofer.de + random_state : numpy.RandomState or int, optional The generator or seed used to determine the starting vector for arpack - iterations. Defaults to numpy.random.RandomState - solver_kwds : any additional keyword arguments to pass to the selected eigen_solver - renormalize : (bool) whether or not to set the rows of the eigenvectors to have norm 1 + iterations. Defaults to numpy.random.RandomState + solver_kwds : any additional keyword arguments to pass to the selected eigen_solver + renormalize : (bool) whether or not to set the rows of the eigenvectors to have norm 1 this can improve label quality stabalize : (bool) whether or not to compute the (more stable) eigenvectors of L = D^-1/2*S*D^-1/2 - instead of P = D^-1*S + instead of P = D^-1*S additional_vectors : (int) compute additional eigen vectors when computing eigen decomposition. - When eigen_solver = 'amg' or 'lopcg' often if a small number of eigen values is sought the + When eigen_solver = 'amg', 'samg' or 'lopcg' often if a small number of eigen values is sought the largest eigenvalue returned is *not* equal to 1 (it should be). This can usually be fixed by requesting more than K eigenvalues until the first eigenvalue is close to 1 and then - omitted. The remaining K-1 eigenvectors should be informative. + omitted. 
The remaining K-1 eigenvectors should be informative. Returns ------- labels: array-like, shape (1,n_samples) - """ + """ # Step 1: get similarity matrix if geom.affinity_matrix is None: S = geom.compute_affinity_matrix() else: S = geom.affinity_matrix - + # Check for stability method, symmetric solvers require this - if eigen_solver in ['lobpcg', 'amg']: + if eigen_solver in ['lobpcg', 'amg', 'samg']: stabalize = True if stabalize: geom.laplacian_type = 'symmetricnormalized' @@ -154,40 +168,40 @@ def spectral_clustering(geom, K, eigen_solver = 'dense', random_state = None, so else: geom.laplacian_type = 'randomwalk' return_lapsym = False - + # Step 2: get the Laplacian matrix P = geom.compute_laplacian_matrix(return_lapsym = return_lapsym) # by default the Laplacian is subtracted from the Identify matrix (this step may not be needed) - P += identity(P.shape[0]) - - # Step 3: Compute the top K eigenvectors and drop the first - if eigen_solver in ['auto', 'amg', 'lobpcg']: + P += identity(P.shape[0]) + + # Step 3: Compute the top K eigenvectors and drop the first + if eigen_solver in ['auto', 'amg', 'lobpcg', 'samg']: n_components = 2*int(np.log(P.shape[0]))*K + 1 n_components += int(additional_vectors) else: n_components = K n_components = min(n_components, P.shape[0]) - (lambdas, eigen_vectors) = eigen_decomposition(P, n_components=n_components, eigen_solver=eigen_solver, + (lambdas, eigen_vectors) = eigen_decomposition(P, n_components=n_components, eigen_solver=eigen_solver, random_state=random_state, drop_first = True, solver_kwds=solver_kwds) - # the first vector is usually uninformative - if eigen_solver in ['auto', 'lobpcg', 'amg']: + # the first vector is usually uninformative + if eigen_solver in ['auto', 'lobpcg', 'amg', 'samg']: if np.abs(lambdas[0] - 1) > 1e-4: warnings.warn("largest eigenvalue not equal to 1. Results may be poor. Try increasing additional_vectors parameter") eigen_vectors = eigen_vectors[:, 1:K] lambdas = lambdas[1:K] - + # If stability method chosen, adjust eigenvectors if stabalize: w = np.array(geom.laplacian_weights) eigen_vectors /= np.sqrt(w[:,np.newaxis]) - eigen_vectors /= np.linalg.norm(eigen_vectors, axis = 0) - + eigen_vectors /= np.linalg.norm(eigen_vectors, axis = 0) + # If renormalize: set each data point to unit length if renormalize: norms = np.linalg.norm(eigen_vectors, axis=1) eigen_vectors /= norms[:,np.newaxis] - + # Step 4: run k-means clustering - labels = k_means_clustering(eigen_vectors,K) - return labels, eigen_vectors, P \ No newline at end of file + labels = k_means_clustering(eigen_vectors,K) + return labels, eigen_vectors, P diff --git a/setup.py b/setup.py index 821d8b7..5ed777e 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,8 @@ def configuration(parent_package='',top_path=None): This repository contains a scalable implementation of several manifold learning algorithms, making use of FLANN for fast approximate nearest neighbors and -PyAMG, LOBPCG, ARPACK, and other routines for fast matrix decompositions. +PyAMG, Fraunhofer SAMG, LOBPCG, ARPACK, and other routines for fast matrix +decompositions. For more information, visit https://github.com/mmp2/megaman """
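
For reference, a minimal usage sketch of the new `eigen_solver='samg'` code path. It assumes `pysamg` and a licensed Fraunhofer SAMG installation are available; the sparse matrix below is only a toy stand-in for a real graph Laplacian, and any `samg_param` keys come from the SAMG documentation rather than from this patch.

```python
# Usage sketch (assumes pysamg + a licensed Fraunhofer SAMG installation).
import numpy as np
from scipy import sparse

from megaman.utils.eigendecomp import eigen_decomposition

n = 2000
rng = np.random.RandomState(42)

# Toy symmetric, sparse, positive definite matrix standing in for a graph
# Laplacian; the lobpcg-based solvers ('lobpcg', 'amg', 'samg') expect a
# symmetric positive definite input.
A = sparse.rand(n, n, density=0.002, format='csr', random_state=rng)
W = A + A.T
L = sparse.diags(np.asarray(W.sum(axis=1)).ravel()) - W  # graph Laplacian (PSD)
G = L + sparse.identity(n)                               # shift to make it PD

lambdas, vectors = eigen_decomposition(
    G,
    n_components=8,
    eigen_solver='samg',   # 'auto' would also pick SAMG here when pysamg is importable
    random_state=rng,
    # SAMG-specific options would be forwarded via
    # solver_kwds={'samg_param': {...}}; valid keys are defined by the SAMG
    # documentation, not by this patch.
)
print(lambdas)
```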