diff --git a/.circleci/config.yml b/.circleci/config.yml index 8f14cc420..4d1bc043c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -69,7 +69,7 @@ commands: jobs: # Build scikit-tree from source - build_scikit-tree: + build_scikit_tree: <<: *defaults steps: - checkout @@ -186,13 +186,16 @@ jobs: - ~/sktree workflows: - commit: + default: jobs: - - build_scikit-tree + - build_scikit_tree: + name: build_scikit_tree - build_docs: + name: build_docs requires: - - build_scikit-tree + - build_scikit_tree - docs-deploy: + name: docs-deploy requires: - build_docs filters: diff --git a/.github/workflows/circle_artifacts.yml b/.github/workflows/circle_artifacts.yml index f7b0fd189..980f070fb 100644 --- a/.github/workflows/circle_artifacts.yml +++ b/.github/workflows/circle_artifacts.yml @@ -3,12 +3,6 @@ on: [status] permissions: read-all -# Restrict the permissions granted to the use of secrets.GITHUB_TOKEN in this -# github actions workflow: -# https://docs.github.com/en/actions/security-guides/automatic-token-authentication -# permissions: -# statuses: write - jobs: circleci_artifacts_redirector_job: runs-on: ubuntu-latest diff --git a/docs/whats_new/v0.1.rst b/docs/whats_new/v0.1.rst index 4798f763b..db984a99f 100644 --- a/docs/whats_new/v0.1.rst +++ b/docs/whats_new/v0.1.rst @@ -34,6 +34,7 @@ Changelog - |Feature| All tree types can compute similarity and dissimilarity matrices, by `Sambit Panda`_ and `Adam Li`_ (:pr:`64`) - |Feature| MORF trees now can normalize by feature weight per sample per feature column, by `Adam Li`_ (:pr:`67`) - |Feature| Implementation of ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeRegressor, ObliqueRandomForestRegressor, PatchObliqueRandomForestRegressor, by `SUKI-O`_ (:pr:`72`) +- |Feature| A general-kernel MORF is now implemented where users can pass in a kernel library, by `Adam Li`_ (:pr:`70`) Code and Documentation Contributors ----------------------------------- diff --git a/examples/plot_kernel_decision_tree.py b/examples/plot_kernel_decision_tree.py new file mode 100644 index 000000000..2176228ea --- /dev/null +++ b/examples/plot_kernel_decision_tree.py @@ -0,0 +1,106 @@ +""" +====================================== +Custom Kernel Decision Tree Classifier +====================================== + +This example shows how to build a manifold oblique decision tree classifier using +a custom set of user-defined kernel/filter library, such as the Gaussian, or Gabor +kernels. + +The example demonstrates superior performance on a 2D dataset with structured images +as samples. The dataset is the downsampled MNIST dataset, where each sample is a +28x28 image. The dataset is downsampled to 14x14, and then flattened to a 196 +dimensional vector. The dataset is then split into a training and testing set. + +See :ref:`sphx_glr_auto_examples_plot_projection_matrices` for more information on +projection matrices and the way they can be sampled. +""" +import matplotlib.pyplot as plt + +# %% +# Importing the necessary modules +import numpy as np +from sklearn.datasets import fetch_openml +from sklearn.metrics import accuracy_score +from sklearn.model_selection import train_test_split + +from sktree.tree import KernelDecisionTreeClassifier + +# %% +# Load the Dataset +# ---------------- +# We need to load the dataset and split it into training and testing sets. + +# Load the dataset +X, y = fetch_openml("mnist_784", version=1, return_X_y=True) + +# Downsample the dataset +X = X.reshape((-1, 28, 28)) +X = X[:, ::2, ::2] +X = X.reshape((-1, 196)) + +# Split the dataset into training and testing sets +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) + +# %% +# Setting up the Custom Kernel Decision Tree Model +# ------------------------------------------------- +# To set up the custom kernel decision tree model, we need to define typical hyperparameters +# for the decision tree classifier, such as the maximum depth of the tree and the minimum +# number of samples required to split an internal node. For the Kernel Decision tree model, +# we also need to define the kernel function and its parameters. + +max_depth = 10 +min_samples_split = 2 + +# Next, we define the hyperparameters for the custom kernels that we will use. +# For example, if we want to use a Gaussian kernel with a sigma of 1.0 and a size of 3x3: +kernel_function = "gaussian" +kernel_params = {"sigma": 1.0, "size": (3, 3)} + +# We can then fit the custom kernel decision tree model to the training set: +clf = KernelDecisionTreeClassifier( + max_depth=max_depth, + min_samples_split=min_samples_split, + data_dims=(28, 28), + min_patch_dims=(1, 1), + max_patch_dims=(14, 14), + dim_contiguous=(True, True), + boundary=None, + n_classes=10, + kernel_function=kernel_function, + n_kernels=500, + store_kernel_library=True, +) + +# Fit the decision tree classifier using the custom kernel +clf.fit(X_train, y_train) + +# %% +# Evaluating the Custom Kernel Decision Tree Model +# ------------------------------------------------ +# To evaluate the custom kernel decision tree model, we can use the testing set. +# We can also inspect the important kernels that the tree selected. + +# Predict the labels for the testing set +y_pred = clf.predict(X_test) + +# Compute the accuracy score +accuracy = accuracy_score(y_test, y_pred) + +print(f"Kernel decision tree model obtained an accuracy of {accuracy} on MNIST.") + +# Get the important kernels from the decision tree classifier +important_kernels = clf.kernel_arr_ +kernel_dims = clf.kernel_dims_ +kernel_params = clf.kernel_params_ +kernel_library = clf.kernel_library_ + +# Plot the important kernels +fig, axes = plt.subplots( + nrows=len(important_kernels), ncols=1, figsize=(6, 4 * len(important_kernels)) +) +for i, kernel in enumerate(important_kernels): + axes[i].imshow(kernel, cmap="gray") + axes[i].set_title("Kernel {}".format(i + 1)) +plt.show() diff --git a/examples/plot_projection_matrices.py b/examples/plot_projection_matrices.py index ef2f0d95f..30a087fe8 100644 --- a/examples/plot_projection_matrices.py +++ b/examples/plot_projection_matrices.py @@ -265,3 +265,40 @@ fig.suptitle("2D Discontiguous Patch Visualization") plt.show() + +# %% +# We will make the patch 2D, which samples multiple rows contiguously. This is +# a 2D patch of size 3 in the columns and 2 in the rows. +dim_contiguous = np.array((False, False)) + +splitter = BestPatchSplitterTester( + criterion, + max_features, + min_samples_leaf, + min_weight_leaf, + random_state, + min_patch_dims, + max_patch_dims, + dim_contiguous, + data_dims, + boundary, + feature_weight, +) +splitter.init_test(X, y, sample_weight) + +# sample the projection matrix that consists of 1D patches +proj_mat = splitter.sample_projection_matrix() + +# Visualize 2D patches +fig, axs = plt.subplots(nrows=3, ncols=3, figsize=(12, 8), sharex=True, sharey=True, squeeze=True) +axs = axs.flatten() +for idx, ax in enumerate(axs): + ax.imshow(proj_mat[idx, :].reshape(data_dims), cmap="viridis") + ax.set( + xlim=(-1, data_dims[1]), + ylim=(-1, data_dims[0]), + title=f"Patch {idx}", + ) + +fig.suptitle("2D Discontiguous In All Dims Patch Visualization") +plt.show() diff --git a/sktree/tree/__init__.py b/sktree/tree/__init__.py index 18e9d0588..2d793ffe7 100644 --- a/sktree/tree/__init__.py +++ b/sktree/tree/__init__.py @@ -1,4 +1,5 @@ from ._classes import ( + KernelDecisionTreeClassifier, ObliqueDecisionTreeClassifier, ObliqueDecisionTreeRegressor, PatchObliqueDecisionTreeClassifier, @@ -14,4 +15,5 @@ "ObliqueDecisionTreeRegressor", "PatchObliqueDecisionTreeClassifier", "PatchObliqueDecisionTreeRegressor", + "KernelDecisionTreeClassifier", ] diff --git a/sktree/tree/_classes.py b/sktree/tree/_classes.py index 6228def24..144c41b35 100644 --- a/sktree/tree/_classes.py +++ b/sktree/tree/_classes.py @@ -21,6 +21,7 @@ from ._neighbors import SimMatrixMixin from ._oblique_splitter import ObliqueSplitter from ._oblique_tree import ObliqueTree +from .kernels import gaussian_kernel from .manifold import _morf_splitter from .manifold._morf_splitter import PatchSplitter from .unsupervised import _unsup_criterion, _unsup_oblique_splitter, _unsup_splitter @@ -28,7 +29,7 @@ from .unsupervised._unsup_oblique_splitter import UnsupervisedObliqueSplitter from .unsupervised._unsup_oblique_tree import UnsupervisedObliqueTree from .unsupervised._unsup_splitter import UnsupervisedSplitter -from .unsupervised._unsup_tree import ( # type: ignore +from .unsupervised._unsup_tree import ( UnsupervisedBestFirstTreeBuilder, UnsupervisedDepthFirstTreeBuilder, UnsupervisedTree, @@ -2237,3 +2238,544 @@ def _build_tree( ) builder.build(self.tree_, X, y, sample_weight) + + +class KernelDecisionTreeClassifier(PatchObliqueDecisionTreeClassifier): + """Oblique decision tree classifier over data patches combined with Gaussian kernels. + + Parameters + ---------- + criterion : {"gini", "entropy"}, default="gini" + The function to measure the quality of a split. Supported criteria are + "gini" for the Gini impurity and "entropy" for the information gain. + + splitter : {"best", "random"}, default="best" + The strategy used to choose the split at each node. Supported + strategies are "best" to choose the best split and "random" to choose + the best random split. + + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + + - If int, then consider `min_samples_split` as the minimum number. + - If float, then `min_samples_split` is a fraction and + `ceil(min_samples_split * n_samples)` are the minimum + number of samples for each split. + + min_samples_leaf : int or float, default=1 + The minimum number of samples required to be at a leaf node. + A split point at any depth will only be considered if it leaves at + least ``min_samples_leaf`` training samples in each of the left and + right branches. This may have the effect of smoothing the model, + especially in regression. + + - If int, then consider `min_samples_leaf` as the minimum number. + - If float, then `min_samples_leaf` is a fraction and + `ceil(min_samples_leaf * n_samples)` are the minimum + number of samples for each node. + + min_weight_fraction_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights (of all + the input samples) required to be at a leaf node. Samples have + equal weight when sample_weight is not provided. + + max_features : int, float or {"auto", "sqrt", "log2"}, default=None + The number of features to consider when looking for the best split: + + - If int, then consider `max_features` features at each split. + - If float, then `max_features` is a fraction and + `int(max_features * n_features)` features are considered at each + split. + - If "auto", then `max_features=sqrt(n_features)`. + - If "sqrt", then `max_features=sqrt(n_features)`. + - If "log2", then `max_features=log2(n_features)`. + - If None, then `max_features=n_features`. + + Note: the search for a split does not stop until at least one + valid partition of the node samples is found, even if it requires to + effectively inspect more than ``max_features`` features. + + Note: Compared to axis-aligned Random Forests, one can set + max_features to a number greater then ``n_features``. + + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. The features are always + randomly permuted at each split, even if ``splitter`` is set to + ``"best"``. When ``max_features < n_features``, the algorithm will + select ``max_features`` at random at each split before finding the best + split among them. But the best found split may vary across different + runs, even if ``max_features=n_features``. That is the case, if the + improvement of the criterion is identical for several splits and one + split has to be selected at random. To obtain a deterministic behaviour + during fitting, ``random_state`` has to be fixed to an integer. + See :term:`Glossary ` for details. + + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + Best nodes are defined as relative reduction in impurity. + If None then unlimited number of leaf nodes. + + min_impurity_decrease : float, default=0.0 + A node will be split if this split induces a decrease of the impurity + greater than or equal to this value. + + The weighted impurity decrease equation is the following:: + + N_t / N * (impurity - N_t_R / N_t * right_impurity + - N_t_L / N_t * left_impurity) + + where ``N`` is the total number of samples, ``N_t`` is the number of + samples at the current node, ``N_t_L`` is the number of samples in the + left child, and ``N_t_R`` is the number of samples in the right child. + + ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum, + if ``sample_weight`` is passed. + + class_weight : dict, list of dict or "balanced", default=None + Weights associated with classes in the form ``{class_label: weight}``. + If None, all classes are supposed to have weight one. For + multi-output problems, a list of dicts can be provided in the same + order as the columns of y. + + Note that for multioutput (including multilabel) weights should be + defined for each class of every column in its own dict. For example, + for four-class multilabel classification weights should be + [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of + [{1:1}, {2:5}, {3:1}, {4:1}]. + + The "balanced" mode uses the values of y to automatically adjust + weights inversely proportional to class frequencies in the input data + as ``n_samples / (n_classes * np.bincount(y))`` + + For multi-output, the weights of each column of y will be multiplied. + + Note that these weights will be multiplied with sample_weight (passed + through the fit method) if sample_weight is specified. + min_patch_dim : array-like, optional + The minimum dimensions of a patch, by default 1 along all dimensions. + max_patch_dim : array-like, optional + The maximum dimensions of a patch, by default 1 along all dimensions. + dim_contiguous : array-like of bool, optional + Whether or not each patch is sampled contiguously along this dimension. + data_dims : array-like, optional + The presumed dimensions of the un-vectorized feature vector, by default + will be a 1D vector with (1, n_features) shape. + boundary : optional, str {'wrap'} + The boundary condition to use when sampling patches, by default None. + 'wrap' corresponds to the boundary condition as is in numpy and scipy. + feature_weight : array-like of shape (n_samples,n_features,), default=None + Feature weights. If None, then features are equally weighted as is. + If provided, then the feature weights are used to weight the + patches that are generated. The feature weights are used + as follows: for every patch that is sampled, the feature weights over + the entire patch is summed and normalizes the patch. + kernel : str {'gaussian', 'uniform'}, default='gaussian' + The kernel to use. + n_kernels : int, optional + The number of different kernels to generate. This number should be very high + as this generates kernels + + Attributes + ---------- + classes_ : ndarray of shape (n_classes,) or list of ndarray + The classes labels (single output problem), + or a list of arrays of class labels (multi-output problem). + + feature_importances_ : ndarray of shape (n_features,) + The impurity-based feature importances. + The higher, the more important the feature. + The importance of a feature is computed as the (normalized) + total reduction of the criterion brought by that feature. It is also + known as the Gini importance [4]_. + + Warning: impurity-based feature importances can be misleading for + high cardinality features (many unique values). See + :func:`sklearn.inspection.permutation_importance` as an alternative. + + max_features_ : int + The inferred value of max_features. + + n_classes_ : int or list of int + The number of classes (for single output problems), + or a list containing the number of classes for each + output (for multi-output problems). + + n_features_in_ : int + Number of features seen during :term:`fit`. + + feature_names_in_ : ndarray of shape (`n_features_in_`,) + Names of features seen during :term:`fit`. Defined only when `X` + has feature names that are all strings. + + n_outputs_ : int + The number of outputs when ``fit`` is performed. + + tree_ : Tree instance + The underlying Tree object. Please refer to + ``help(sklearn.tree._tree.Tree)`` for + attributes of Tree object. + + min_patch_dims_ : array-like + The minimum dimensions of a patch. + + max_patch_dims_ : array-like + The maximum dimensions of a patch. + + data_dims_ : array-like + The presumed dimensions of the un-vectorized feature vector. + + kernel_arr_ : list of length (n_nodes,) of array-like of shape (patch_dims,) + The kernel array that is applied to the patches for this tree. + The order is in the same order in which the tree traverses the nodes. + + kernel_params_ : list of length (n_nodes,) + The parameters of the kernel that is applied to the patches for this tree. + The order is in the same order in which the tree traverses the nodes. + + kernel_library_ : array-like of shape (n_kernels,), optional + The library of kernels that was chosen from the patches for this tree. + Only stored if ``store_kernel_library`` is set to True. + + kernel_library_params_ : list of length (n_nodes,), optional + The parameters of the kernels that was chosen from the patches for this tree. + The order is in the same order in which the tree traverses the nodes. + Only stored if ``store_kernel_library`` is set to True. + + References + ---------- + .. footbibliography:: + """ + + _parameter_constraints = { + **DecisionTreeClassifier._parameter_constraints, + "min_patch_dims": ["array-like", None], + "max_patch_dims": ["array-like", None], + "data_dims": ["array-like", None], + "dim_contiguous": ["array-like", None], + "boundary": [str, None], + "feature_weight": ["array-like", None], + "kernel": ["str"], + "n_kernels": [int, None], + "store_kernel_library": [bool], + } + + def __init__( + self, + *, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0.0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0.0, + class_weight=None, + min_patch_dims=None, + max_patch_dims=None, + dim_contiguous=None, + data_dims=None, + boundary=None, + feature_weight=None, + kernel="gaussian", + n_kernels=None, + store_kernel_library=False, + ): + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + max_leaf_nodes=max_leaf_nodes, + class_weight=class_weight, + random_state=random_state, + min_impurity_decrease=min_impurity_decrease, + ) + + self.min_patch_dims = min_patch_dims + self.max_patch_dims = max_patch_dims + self.dim_contiguous = dim_contiguous + self.data_dims = data_dims + self.boundary = boundary + self.feature_weight = feature_weight + + self.kernel = kernel + self.n_kernel = n_kernels + self.store_kernel_library = store_kernel_library + + def _build_tree( + self, + X, + y, + sample_weight, + min_samples_leaf, + min_weight_leaf, + max_leaf_nodes, + min_samples_split, + max_depth, + random_state, + ): + """Build the actual tree. + + Parameters + ---------- + X : {array-like, sparse matrix} of shape (n_samples, n_features) + The training input samples. Internally, it will be converted to + ``dtype=np.float32`` and if a sparse matrix is provided + to a sparse ``csc_matrix``. + y : array-like of shape (n_samples,) or (n_samples, n_outputs) + The target values (class labels) as integers or strings. + sample_weight : array-like of shape (n_samples,), default=None + Sample weights. If None, then samples are equally weighted. Splits + that would create child nodes with net zero or negative weight are + ignored while searching for a split in each node. Splits are also + ignored if they would result in any single class carrying a + negative weight in either child node. + min_samples_leaf : int or float + The minimum number of samples required to be at a leaf node. + min_weight_leaf : float, default=0.0 + The minimum weighted fraction of the sum total of weights. + max_leaf_nodes : int, default=None + Grow a tree with ``max_leaf_nodes`` in best-first fashion. + min_samples_split : int or float, default=2 + The minimum number of samples required to split an internal node: + max_depth : int, default=None + The maximum depth of the tree. If None, then nodes are expanded until + all leaves are pure or until all leaves contain less than + min_samples_split samples. + random_state : int, RandomState instance or None, default=None + Controls the randomness of the estimator. + """ + # Build tree + criterion = self.criterion + if not isinstance(criterion, BaseCriterion): + criterion = CRITERIA_CLF[self.criterion](self.n_outputs_, self.n_classes_) + else: + # Make a deepcopy in case the criterion has mutable attributes that + # might be shared and modified concurrently during parallel fitting + criterion = copy.deepcopy(criterion) + + splitter = self.splitter + if issparse(X): + raise ValueError( + "Sparse input is not supported for oblique trees. " + "Please convert your data to a dense array." + ) + else: + PATCH_SPLITTERS = PATCH_DENSE_SPLITTERS + + # compute user-defined Kernel library before + kernel_library, kernel_dims, kernel_params = self._sample_kernel_library(X, y) + + # Note: users cannot define a splitter themselves + splitter = PATCH_SPLITTERS[self.splitter]( + criterion, + self.max_features_, + min_samples_leaf, + min_weight_leaf, + random_state, + self.min_patch_dims_, + self.max_patch_dims_, + self.dim_contiguous_, + self.data_dims_, + self.boundary, + self.feature_weight, + kernel_library, + ) + + self.tree_ = ObliqueTree(self.n_features_in_, self.n_classes_, self.n_outputs_) + + # Use BestFirst if max_leaf_nodes given; use DepthFirst otherwise + if max_leaf_nodes < 0: + builder = DepthFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + self.min_impurity_decrease, + ) + else: + builder = BestFirstTreeBuilder( + splitter, + min_samples_split, + min_samples_leaf, + min_weight_leaf, + max_depth, + max_leaf_nodes, + self.min_impurity_decrease, + ) + + builder.build(self.tree_, X, y, sample_weight) + + # Set some attributes based on what kernel indices were used in each tree + # - construct a tree-nodes like array and set it as a Python attribute, so it is + # exposed to the Python interface + # - use the Cython tree.feature array to store the index of the dictionary library + # that was used to split at each node + kernel_idx_chosen = self.tree_.feature + kernel_lib_chosen = kernel_library[kernel_idx_chosen] + kernel_params_chosen = kernel_params[kernel_idx_chosen] + self.kernel_arr_ = kernel_lib_chosen + self.kernel_dims_ = kernel_dims[kernel_idx_chosen] + self.kernel_params_ = kernel_params_chosen + + if self.store_kernel_library: + self.kernel_library_ = kernel_library + self.kernel_idx_chosen_ = kernel_idx_chosen + + if self.n_outputs_ == 1: + self.n_classes_ = self.n_classes_[0] + self.classes_ = self.classes_[0] + + def _sample_kernel_library(self, X, y, sample_weight): + """Samples the dictionary library that is sampled from in the random forest. + + A kernel can either be applied with the boundaries of the image in mind, such that + the patch can be uniformly applied across all indices of rows of ``X_structured``. This + is equivalent to passing in ``boundary = 'wrap'``. Alternatively, a kernel can be sampled, + such that the patch is always contained within the boundary of the image. This is equivalent + to passing in ``boundary = None``. + + Parameters + ---------- + X : _type_ + _description_ + y : _type_ + _description_ + sample_weight : _type_ + _description_ + + Returns + ------- + kernel_library : array-like of shape (n_kernels, n_features) + The dictionary that will be sampled from the random forest. Non-zero entries indicate + where the kernel is applied. This is a n-dimensional kernel matrix that is vectorized + and placed into the original ``X`` data dimensions of (n_dims,). + kernel_dims : list of (n_kernels,) length + The dimensions of each kernel that is sampled. For example, (n_dims_x, n_dims_y) in + the first element indicates that the first kernel has a shape of (n_dims_x, n_dims_y). + This can be used to then place where the top-left seed of each kernel is applied. + kernel_params : list of (n_kernels,) length + A list of dictionaries representing the parameters of each kernel. This is used to + keep track of what kernels and parameterizations were valuable when used in the random + forest. + """ + raise NotImplementedError("This method should be implemented in a child class.") + + +class GaussianKernelDecisionTreeClassifier(KernelDecisionTreeClassifier): + _parameter_constraints = { + **KernelDecisionTreeClassifier._parameter_constraints, + # "mu_bounds": ['array-like'], + # "sigma_bounds": ['array-like'], + } + + def __init__( + self, + *, + criterion="gini", + splitter="best", + max_depth=None, + min_samples_split=2, + min_samples_leaf=1, + min_weight_fraction_leaf=0, + max_features=None, + random_state=None, + max_leaf_nodes=None, + min_impurity_decrease=0, + class_weight=None, + min_patch_dims=None, + max_patch_dims=None, + dim_contiguous=None, + data_dims=None, + boundary=None, + feature_weight=None, + n_kernels=None, + store_kernel_library=False, + mu_bounds=(0, 1), + sigma_bounds=(0, 1), + ): + super().__init__( + criterion=criterion, + splitter=splitter, + max_depth=max_depth, + min_samples_split=min_samples_split, + min_samples_leaf=min_samples_leaf, + min_weight_fraction_leaf=min_weight_fraction_leaf, + max_features=max_features, + random_state=random_state, + max_leaf_nodes=max_leaf_nodes, + min_impurity_decrease=min_impurity_decrease, + class_weight=class_weight, + min_patch_dims=min_patch_dims, + max_patch_dims=max_patch_dims, + dim_contiguous=dim_contiguous, + data_dims=data_dims, + boundary=boundary, + feature_weight=feature_weight, + kernel="gaussian", + n_kernels=n_kernels, + store_kernel_library=store_kernel_library, + ) + self.mu_bounds = mu_bounds + self.sigma_bounds = sigma_bounds + + def _sample_kernel_library(self, X, y, sample_weight): + """Samples a gaussian kernel library.""" + rng = np.random.default_rng(self.random_state) + kernel_library = [] + kernel_params = [] + kernel_dims = [] + + # Sample the kernel library + ndim = len(self.data_dims_) + for _ in range(self.n_kernels): + patch_shape = [] + for idx in range(ndim): + # Note: By constraining max patch height/width to be at least the min + # patch height/width this ensures that the minimum value of + # patch_height and patch_width is 1 + patch_dim = rng.randint(self.min_patch_dims_[idx], self.max_patch_dims_[idx] + 1) + + # sample the possible patch shape given input parameters + if self.boundary == "wrap": + # add circular boundary conditions + delta_patch_dim = self.data_dims_[idx] + 2 * (patch_dim - 1) + + # sample the top left index for this dimension + top_left_patch_seed = rng.randint(0, delta_patch_dim) + + # resample the patch dimension due to padding + dim = top_left_patch_seed % delta_patch_dim + + # resample the patch dimension due to padding + patch_dim = min( + patch_dim, min(dim + 1, self.data_dims_[idx] + patch_dim - dim - 1) + ) + + patch_shape.append(patch_dim) + + patch_shape = np.array(patch_shape) + + # sample the sigma and mu parameters + sigma = rng.uniform(low=self.mu_bounds[0], high=self.mu_bounds[1]) + mu = rng.uniform(low=self.sigma_bounds[0], high=self.sigma_bounds[1]) + + kernel = gaussian_kernel(shape=patch_shape, sigma=sigma, mu=mu) + + kernel_dims.append(kernel.shape) + kernel_library.append(kernel) + kernel_params.append({"shape": patch_shape, "sigma": sigma, "mu": mu}) + + return kernel_library, kernel_dims, kernel_params diff --git a/sktree/tree/_oblique_splitter.pyx b/sktree/tree/_oblique_splitter.pyx index e58beeca2..adc6bed9c 100644 --- a/sktree/tree/_oblique_splitter.pyx +++ b/sktree/tree/_oblique_splitter.pyx @@ -161,7 +161,7 @@ cdef class BaseObliqueSplitter(Splitter): # sort samples according to the feature values. for idx in range(start, end): # initialize the feature value to 0 - feature_values[idx] = 0 + feature_values[idx] = 0.0 for jdx in range(0, proj_vec_indices.size()): feature_values[idx] += self.X[ samples[idx], deref(proj_vec_indices)[jdx] diff --git a/sktree/tree/_oblique_tree.pyx b/sktree/tree/_oblique_tree.pyx index 974e0b576..18cafe9ff 100644 --- a/sktree/tree/_oblique_tree.pyx +++ b/sktree/tree/_oblique_tree.pyx @@ -1,6 +1,7 @@ # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False +# cython: initializedcheck=False # Authors: Adam Li # Chester Huynh diff --git a/sktree/tree/_utils.pyx b/sktree/tree/_utils.pyx index b1be10826..e1ea8e879 100644 --- a/sktree/tree/_utils.pyx +++ b/sktree/tree/_utils.pyx @@ -1,5 +1,5 @@ -# cython: cdivision=True # distutils: language=c++ +# cython: cdivision=True # cython: language_level=3 # cython: boundscheck=False # cython: wraparound=False @@ -10,6 +10,7 @@ cimport numpy as cnp cnp.import_array() + cpdef unravel_index( SIZE_t index, cnp.ndarray[SIZE_t, ndim=1] shape diff --git a/sktree/tree/kernels.py b/sktree/tree/kernels.py new file mode 100644 index 000000000..e2d6b972f --- /dev/null +++ b/sktree/tree/kernels.py @@ -0,0 +1,13 @@ +import numpy as np +from numpy.typing import ArrayLike + + +def gaussian_kernel(shape, sigma: float = 1.0, mu: float = 0.0) -> ArrayLike: + """N-dimensional gaussian kernel for the given shape. + + See: https://gist.github.com/liob/e784775e882b83749cb3bbcef480576e + """ + m = np.meshgrid(*[np.linspace(-1, 1, s) for s in shape]) + d = np.sqrt(np.sum([x * x for x in m], axis=0)) + g = np.exp(-((d - mu) ** 2 / (2.0 * sigma**2))) + return g / np.sum(g) diff --git a/sktree/tree/manifold/_morf_splitter.pxd b/sktree/tree/manifold/_morf_splitter.pxd index 58c52d57a..f114b68bd 100644 --- a/sktree/tree/manifold/_morf_splitter.pxd +++ b/sktree/tree/manifold/_morf_splitter.pxd @@ -1,8 +1,6 @@ # distutils: language = c++ # Authors: Adam Li -# Chester Huynh -# Parth Vora # # License: BSD 3 clause @@ -33,6 +31,13 @@ from .._oblique_splitter cimport BaseObliqueSplitter, ObliqueSplitRecord # discrete_distribution(T first, T last) except + # operator()(&G) except + +# XXX: replace with from libcpp.algorithm cimport swap +# when Cython 3.0 is released +cdef extern from "" namespace "std" nogil: + void swap[T](T& a, T& b) except + # array overload also works + +ctypedef DTYPE_t* DTYPE_t_ptr +ctypedef SIZE_t* SIZE_t_ptr cdef class PatchSplitter(BaseObliqueSplitter): # The PatchSplitter creates candidate feature values by sampling 2D patches from @@ -82,19 +87,6 @@ cdef class PatchSplitter(BaseObliqueSplitter): cdef class UserKernelSplitter(PatchSplitter): """A class to hold user-specified kernels.""" - cdef vector[DTYPE_t[:, ::1]] kernel_dictionary # A list of C-contiguous 2D kernels - - -cdef class GaussianKernelSplitter(PatchSplitter): - """A class to hold Gaussian kernels. - Overrides the weights that are generated to be sampled from a Gaussian distribution. - See: https://www.tutorialspoint.com/gaussian-filter-generation-in-cplusplus - See: https://gist.github.com/thomasaarholt/267ec4fff40ca9dff1106490ea3b7567 - """ - - cdef void sample_proj_mat( - self, - vector[vector[DTYPE_t]]& proj_mat_weights, - vector[vector[SIZE_t]]& proj_mat_indices - ) nogil + cdef vector[DTYPE_t*] kernel_dictionary # A list of C-contiguous 2D kernels + cdef vector[SIZE_t*] kernel_dims # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` diff --git a/sktree/tree/manifold/_morf_splitter.pyx b/sktree/tree/manifold/_morf_splitter.pyx index b86e6e74a..45e4f7116 100644 --- a/sktree/tree/manifold/_morf_splitter.pyx +++ b/sktree/tree/manifold/_morf_splitter.pyx @@ -367,11 +367,25 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): if self.dim_contiguous[dim_idx] is True: continue + # determine the "row" we are currently on + # other_dims_offset = 1 + # for idx in range(dim_idx + 1, self.ndim): + # other_dims_offset *= self.data_dims[idx] + # row_index = self.unraveled_patch_point[dim_idx] % other_dims_offset # determine the "row" we are currently on other_dims_offset = 1 for idx in range(dim_idx + 1, self.ndim): - other_dims_offset *= self.data_dims[idx] - row_index = self.unraveled_patch_point[dim_idx] % other_dims_offset + if not self.dim_contiguous[idx]: + other_dims_offset *= self.data_dims[idx] + + row_index = 0 + for idx in range(dim_idx + 1, self.ndim): + if not self.dim_contiguous[idx]: + row_index += ( + (self.unraveled_patch_point[idx] // other_dims_offset) % + self.patch_dims_buff[idx] + ) * other_dims_offset + other_dims_offset //= self.data_dims[idx] # assign random row index now self.unraveled_patch_point[dim_idx] = self._index_patch_buffer[row_index] @@ -420,7 +434,6 @@ cdef class BestPatchSplitter(BaseDensePatchSplitter): if self.feature_weight is not None: feature_values[idx] /= patch_weight - cdef class BestPatchSplitterTester(BestPatchSplitter): """A class to expose a Python interface for testing.""" cpdef sample_top_left_seed_cpdef(self): @@ -499,3 +512,113 @@ cdef class BestPatchSplitterTester(BestPatchSplitter): Sample weights. """ self.init(X, y, sample_weight) + +cdef class UserKernelSplitter(PatchSplitter): + def __cinit__( + self, + criterion: Criterion, + max_features: SIZE_t, + min_samples_leaf: SIZE_t, + min_weight_leaf: double, + random_state: object, + min_patch_dims: SIZE_t, + max_patch_dims: SIZE_t, + dim_contiguous: cnp.uint8_t, + data_dims: SIZE_t, + boundary: str, + feature_weight: DTYPE_t, + kernel_dictionary: object, + kernel_dims: object, + *argv + ): + # initialize the kernel dictionary into a vector to allow Cython to see it + # see: https://stackoverflow.com/questions/46240207/passing-list-of-numpy-arrays-to-c-using-cython + cdef int n_arrays = len(kernel_dictionary) + self.kernel_dictionary = vector[DTYPE_t_ptr](n_arrays) # A list of C-contiguous 2D kernels + self.kernel_dims = vector[SIZE_t_ptr](n_arrays) # A list of arrays storing the dimensions of each kernel in `kernel_dictionary` + + # buffers to point to each element in the list + cdef DTYPE_t[:] kernel + cdef SIZE_t[:] kernel_dim + + cdef int i + for i in range(n_arrays): + kernel = kernel_dictionary[i] + kernel_dim = kernel_dims[i] + + # store a pointer to the data + self.kernel_dictionary.push_back(&kernel[0]) + self.kernel_dims.push_back(&kernel_dim[0]) + + cdef void sample_proj_mat( + self, + vector[vector[DTYPE_t]]& proj_mat_weights, + vector[vector[SIZE_t]]& proj_mat_indices + ) noexcept nogil: + """Sample projection matrix using a contiguous patch. + + Randomly sample patches with weight of 1. + """ + cdef SIZE_t max_features = self.max_features + cdef int proj_i + + # define parameters for vectorized points in the original data shape + # and top-left seed + cdef SIZE_t top_left_patch_seed + + # size of the sampled patch, which is just the size of the n-dim patch + # (\prod_i self.patch_dims_buff[i]) + cdef SIZE_t patch_size + + cdef DTYPE_t[:] kernel + cdef SIZE_t[:] kernel_dim + + for proj_i in range(0, max_features): + # now get the top-left seed that is used to then determine the top-left + # position in patch + # compute top-left seed for the multi-dimensional patch + top_left_patch_seed, patch_size = self.sample_top_left_seed() + + # sample a random index in the kernel library + # kernel_idx = + + # get that kernel and add it to the projection vector indices and weights + kernel = self.kernel_dictionary[kernel_idx] + kernel_dim = self.kernel_dims[kernel_idx] + + # convert top-left-patch-seed to unraveled indices + # get the top-left index in the original data + top_left_idx = self.unravel_index(top_left_patch_seed, self.data_dims_buff, self.ndim) + + # loop over the kernel and add the weights and indices to the projection + for idim in range(self.ndim): + # get the dimension of the kernel + kernel_dim = self.kernel_dims[kernel_idx][idim] + + # get the top-left index in the kernel + top_left_kernel_idx = self.unravel_index(top_left_patch_seed, kernel_dim, self.ndim) + + # loop over the kernel and add the weights and indices to the projection + # for i in range(kernel_dim): + # # get the index in the original data + # idx = self.ravel_multi_index(top_left_idx, self.data_dims_buff, self.ndim) + + # # get the index in the kernel + # kernel_idx = self.ravel_multi_index(top_left_kernel_idx, kernel_dim, self.ndim) + + # # add the weight and index to the projection matrix + # proj_mat_weights[proj_i].push_back(kernel[kernel_idx]) + # proj_mat_indices[proj_i].push_back(idx) + + # # increment the top-left index in the original data + # top_left_idx[idim] += 1 + + # # increment the top-left index in the kernel + # top_left_kernel_idx[idim] += 1 + + # # increment the top-left index in the original data + # top_left_idx[idim] += self.patch_dims_buff[idim] - kernel_dim + + # # increment the top-left index in the kernel + # top_left_kernel_idx[idim] += self.patch_dims_buff[idim] - kernel_dim + \ No newline at end of file diff --git a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx index 3c882995a..fb66ac752 100644 --- a/sktree/tree/unsupervised/_unsup_oblique_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_oblique_tree.pyx @@ -1,6 +1,7 @@ # cython: cdivision=True # cython: boundscheck=False # cython: wraparound=False +# cython: initializedcheck=False # Authors: Adam Li # diff --git a/sktree/tree/unsupervised/_unsup_tree.pyx b/sktree/tree/unsupervised/_unsup_tree.pyx index 5946a8f77..44f2203e5 100644 --- a/sktree/tree/unsupervised/_unsup_tree.pyx +++ b/sktree/tree/unsupervised/_unsup_tree.pyx @@ -3,7 +3,6 @@ # cython: wraparound=False, initializedcheck=False, cdivision=True # Authors: Adam Li -# Jong Shin # # License: BSD 3 clause