diff --git a/.circleci/config.yml b/.circleci/config.yml index 8f5c98550..6d76eb045 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -84,7 +84,6 @@ jobs: pip install --upgrade pip spin spin setup-submodule pip install .[build,doc] - pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force - run: name: build treeple diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 492f70850..c4cb3b447 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -85,7 +85,7 @@ jobs: pip install -r test_requirements.txt - name: Install nightly wheels for scikit-learn (only for ubuntu 3.12) - if: ${{ matrix.python-version == '3.12' }} && ${{ matrix.os == 'ubuntu-latest' }} + if: ${{ matrix.python-version == '3.12' && matrix.os == 'ubuntu-latest' }} run: | pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force @@ -189,7 +189,6 @@ jobs: pip install compilers pip install -r build_requirements.txt pip install -r test_requirements.txt - pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force - name: Prepare compiler cache id: prep-ccache @@ -285,7 +284,6 @@ jobs: pip install spin pip install -r build_requirements.txt pip install -r test_requirements.txt - pip install --pre --extra-index https://pypi.anaconda.org/scientific-python-nightly-wheels/simple scikit-learn --force - name: Build run: | diff --git a/build_requirements.txt b/build_requirements.txt index ec63cfb3b..90af56671 100644 --- a/build_requirements.txt +++ b/build_requirements.txt @@ -3,7 +3,7 @@ meson-python>=0.16.0 cython>=3.0.10 ninja numpy -scikit-learn>=1.5.0 +scikit-learn~=1.6.0 click rich-click doit diff --git a/pyproject.toml b/pyproject.toml index 2bb8c6a23..543e0e767 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ requires = [ "setuptools<=65.5", "packaging", 
"Cython>=3.0.10", - "scikit-learn>=1.6.0", + "scikit-learn~=1.6.0", "scipy>=1.5.0", "numpy>=1.25; python_version>='3.9'" ] diff --git a/treeple/_lib/sklearn_fork b/treeple/_lib/sklearn_fork index 4fd15fdf8..0e43e917a 160000 --- a/treeple/_lib/sklearn_fork +++ b/treeple/_lib/sklearn_fork @@ -1 +1 @@ -Subproject commit 4fd15fdf88737e7e84e96217b2c9b0ce0c162c2c +Subproject commit 0e43e917a6734fc61a8c9999bc4b4a563476ec58 diff --git a/treeple/datasets/hyppo.py b/treeple/datasets/hyppo.py index 781c124ab..6efdc6c44 100644 --- a/treeple/datasets/hyppo.py +++ b/treeple/datasets/hyppo.py @@ -562,7 +562,6 @@ def make_trunk_classification( ) y = np.concatenate((np.zeros(n_samples // 2), np.ones(n_samples // 2))) - if return_params: return [X, y, [mu_0_vec, mu_1_vec], [cov, cov]] return X, y diff --git a/treeple/tree/_oblique_splitter.pyx b/treeple/tree/_oblique_splitter.pyx index 0cceac664..72184cb09 100644 --- a/treeple/tree/_oblique_splitter.pyx +++ b/treeple/tree/_oblique_splitter.pyx @@ -11,7 +11,7 @@ from libcpp.vector cimport vector from .._lib.sklearn.tree._criterion cimport Criterion from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform -from ._utils cimport fisher_yates_shuffle +from ._utils cimport floyd_sample_indices cdef float64_t INFINITY = np.inf @@ -194,9 +194,8 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): self.X = X - # create a helper array for allowing efficient Fisher-Yates - self.indices_to_sample = np.arange(self.max_features * self.n_features, - dtype=np.intp) + # create a helper array for allowing efficient Fisher-Yates/ Floyd's method + self.indices_to_sample = np.zeros(self.n_non_zeros, dtype=np.intp) # XXX: Just to initialize stuff # self.feature_weights = np.ones((self.n_features,), dtype=float32_t) / self.n_features @@ -238,8 +237,8 @@ cdef class ObliqueSplitter(BaseObliqueSplitter): cdef intp_t[::1] indices_to_sample = self.indices_to_sample cdef intp_t grid_size = self.max_features * self.n_features - # shuffle indices over the 2D 
grid to sample using Fisher-Yates - fisher_yates_shuffle(indices_to_sample, grid_size, random_state) + # draw n_non_zeros random indices from the mTry x n_features set of indices + floyd_sample_indices(indices_to_sample, n_non_zeros, grid_size, random_state) # sample 'n_non_zeros' in a mtry X n_features projection matrix # which consists of +/- 1's chosen at a 1/2s rate diff --git a/treeple/tree/_utils.pxd b/treeple/tree/_utils.pxd index ba2707791..1810596ea 100644 --- a/treeple/tree/_utils.pxd +++ b/treeple/tree/_utils.pxd @@ -22,6 +22,14 @@ cdef void fisher_yates_shuffle( ) noexcept nogil +cdef void floyd_sample_indices( + intp_t[::1] out, + intp_t k, + intp_t n, + uint32_t* random_state +) noexcept nogil + + cdef int rand_weighted_binary( float64_t p0, uint32_t* random_state diff --git a/treeple/tree/_utils.pyx b/treeple/tree/_utils.pyx index 7ce48977b..be1df9ab1 100644 --- a/treeple/tree/_utils.pyx +++ b/treeple/tree/_utils.pyx @@ -11,6 +11,8 @@ cimport numpy as cnp cnp.import_array() +from libcpp.unordered_set cimport unordered_set + from .._lib.sklearn.tree._utils cimport rand_int, rand_uniform @@ -41,6 +43,39 @@ cdef inline void fisher_yates_shuffle( indices_to_sample[i], indices_to_sample[j] +cdef inline void floyd_sample_indices( + intp_t[::1] out, + intp_t k, + intp_t n, + uint32_t* random_state +) noexcept nogil: + """ + Robert Floyd's algorithm for sampling without replacement + + Parameters + ---------- + out : intp_t[::1] + Output memoryview where the sampled integers are stored. + k : intp_t + Number of samples to draw. + n : intp_t + Size of the domain to sample from. + random_state : uint32_t* + The random state. + """ + cdef unordered_set[intp_t] seen + cdef intp_t i, r = 0 + + for i in range(n - k, n): + r = rand_int(0, i + 1, random_state) + if seen.find(r) == seen.end(): + seen.insert(r) + out[i - n + k] = r + else: + seen.insert(i) + out[i - n + k] = i + + cdef inline int rand_weighted_binary( float64_t p0, uint32_t* random_state