Skip to content

Commit 1b3c315

Browse files
authored
Merge branch 'main' into honestoblique_yuxin
2 parents 214d689 + ccccf9c commit 1b3c315

File tree

6 files changed

+54
-25
lines changed

6 files changed

+54
-25
lines changed

meson.build

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ project(
44
# Note that the git commit hash cannot be added dynamically here
55
# That only happens when importing from a git repository.
66
# See `treeple/__init__.py`
7-
version: '0.10.0.dev0',
7+
version: '0.10.3',
88
license: 'PolyForm Noncommercial 1.0.0',
99
meson_version: '>= 1.1.0',
1010
default_options: [

pyproject.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@ requires = [
1010
"setuptools<=65.5",
1111
"packaging",
1212
"Cython>=3.0.10",
13-
"scikit-learn>=1.5.0",
13+
"scikit-learn>=1.6.0",
1414
"scipy>=1.5.0",
1515
"numpy>=1.25; python_version>='3.9'"
1616
]
1717

1818
[project]
1919
name = "treeple"
20-
version = "0.10.0.dev0"
20+
version = "0.10.3"
2121
description = "Modern decision trees in Python"
2222
maintainers = [
2323
{name = "Neurodata", email = "adam.li@columbia.edu"}
@@ -52,7 +52,7 @@ include = [
5252
dependencies = [
5353
'numpy>=1.25.0',
5454
'scipy>=1.5.0',
55-
'scikit-learn>=1.5.0'
55+
'scikit-learn>=1.6.0'
5656
]
5757

5858
[project.optional-dependencies]
@@ -70,7 +70,7 @@ build = [
7070
'meson-python',
7171
'spin>=0.12',
7272
'doit',
73-
'scikit-learn>=1.5.0',
73+
'scikit-learn>=1.6.0',
7474
'Cython>=3.0.10',
7575
'ninja',
7676
'numpy>=1.25.0',

treeple/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import os
55
import sys
66

7-
__version__ = "0.10.0dev0"
7+
__version__ = "0.10.3"
88
logger = logging.getLogger(__name__)
99

1010

treeple/ensemble/_honest_forest.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -182,7 +182,7 @@ class HonestForestClassifier(ForestClassifier, ForestClassifierMixin):
182182
``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
183183
if ``sample_weight`` is passed.
184184
185-
bootstrap : bool, default=False
185+
bootstrap : bool, default=True
186186
Whether bootstrap samples are used when building trees. If False, the
187187
whole dataset is used to build each tree.
188188
@@ -270,24 +270,30 @@ class HonestForestClassifier(ForestClassifier, ForestClassifierMixin):
270270
Fraction of training samples used for estimates in the trees. The
271271
remaining samples will be used to learn the tree structure. A larger
272272
fraction creates shallower trees with lower variance estimates.
273-
273+
274274
honest_method : {"prune", "apply"}, default="prune"
275275
Method for enforcing honesty. If "prune", the tree is pruned to enforce
276276
honesty. If "apply", the tree is not pruned, but the leaf estimates are
277277
adjusted to enforce honesty.
278278
279+
kernel_method : bool, default=True
280+
Method for normalizing ``predict_proba`` posteriors by the number of
281+
samples in the leaf nodes across the forest. Contrary to the average of
282+
posteriors, the kernel method only normalizes the probabilities once.
283+
By default True.
284+
279285
tree_estimator : object, default=None
280286
Instantiated tree of type BaseDecisionTree from treeple.
281287
If None, then sklearn's DecisionTreeClassifier with default parameters will
282288
be used. Note that none of the parameters in ``tree_estimator`` need
283289
to be set. The parameters of the ``tree_estimator`` can be set using
284290
the ``tree_estimator_params`` keyword argument.
285291
286-
stratify : bool
292+
stratify : bool, default=True
287293
Whether or not to stratify sample when considering structure and leaf indices.
288294
This will also stratify samples when bootstrap sampling is used. For more
289295
information, see :func:`sklearn.utils.resample`.
290-
By default False.
296+
By default True.
291297
292298
**tree_estimator_params : dict
293299
Parameters to pass to the underlying base tree estimators.
@@ -462,12 +468,13 @@ def __init__(
462468
warm_start=False,
463469
class_weight=None,
464470
ccp_alpha=0.0,
465-
max_samples=None,
471+
max_samples=1.6,
466472
honest_prior="ignore",
467473
honest_fraction=0.5,
468-
honest_method="apply",
474+
honest_method="prune",
475+
kernel_method=True,
469476
tree_estimator=None,
470-
stratify=False,
477+
stratify=True,
471478
**tree_estimator_params,
472479
):
473480
super().__init__(
@@ -490,6 +497,7 @@ def __init__(
490497
"honest_prior",
491498
"honest_method",
492499
"stratify",
500+
"kernel_method",
493501
),
494502
bootstrap=bootstrap,
495503
oob_score=oob_score,
@@ -513,7 +521,7 @@ def __init__(
513521
self.honest_fraction = honest_fraction
514522
self.honest_prior = honest_prior
515523
self.honest_method = honest_method
516-
print(self.honest_method)
524+
self.kernel_method = kernel_method
517525
self.tree_estimator = tree_estimator
518526
self.stratify = stratify
519527
self._tree_estimator_params = tree_estimator_params

treeple/tree/_honest_tree.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -186,9 +186,9 @@ class HonestTreeClassifier(MetaEstimatorMixin, ClassifierMixin, BaseDecisionTree
186186
classes). If "empirical", the prior tree posterior is the relative
187187
class frequency in the voting subsample.
188188
189-
stratify : bool
189+
stratify : bool, default=True
190190
Whether or not to stratify sample when considering structure and leaf indices.
191-
By default False.
191+
By default True.
192192
193193
honest_method : {"apply", "prune"}, default="apply"
194194
Method to use for fitting the leaf nodes. If "apply", the leaf nodes
@@ -197,6 +197,12 @@ class frequency in the voting subsample.
197197
by pruning using the honest-set of data after the tree structure is built
198198
using the structure-set of data.
199199
200+
kernel_method : bool, default=False
201+
Method for normalizing ``predict_proba`` posteriors by the number of
202+
samples in the leaf nodes across the forest. Not applicable to single
203+
honest trees.
204+
By default False.
205+
200206
**tree_estimator_params : dict
201207
Parameters to pass to the underlying base tree estimators.
202208
These must be parameters for ``tree_estimator``.
@@ -338,8 +344,9 @@ def __init__(
338344
monotonic_cst=None,
339345
honest_fraction=0.5,
340346
honest_prior="empirical",
341-
stratify=False,
347+
stratify=True,
342348
honest_method="apply",
349+
kernel_method=False,
343350
**tree_estimator_params,
344351
):
345352
self.tree_estimator = tree_estimator
@@ -361,6 +368,7 @@ def __init__(
361368
self.honest_prior = honest_prior
362369
self.stratify = stratify
363370
self.honest_method = honest_method
371+
self.kernel_method = kernel_method
364372

365373
# XXX: to enable this, we need to also reset the leaf node samples during `_set_leaf_nodes`
366374
self.store_leaf_values = False
@@ -876,9 +884,11 @@ class in a leaf.
876884

877885
if self.n_outputs_ == 1:
878886
proba = proba[:, : self._tree_n_classes_]
879-
# normalizer = proba.sum(axis=1)[:, np.newaxis]
880-
# normalizer[normalizer == 0.0] = 1.0
881-
# proba /= normalizer
887+
888+
if not self.kernel_method:
889+
normalizer = proba.sum(axis=1)[:, np.newaxis]
890+
normalizer[normalizer == 0.0] = 1.0
891+
proba /= normalizer
882892
proba = self._empty_leaf_correction(proba)
883893

884894
return proba
@@ -888,10 +898,13 @@ class in a leaf.
888898

889899
for k in range(self.n_outputs_):
890900
proba_k = proba[:, k, : self._tree_n_classes_[k]]
891-
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
892-
# normalizer[normalizer == 0.0] = 1.0
893-
# proba_k /= normalizer
894-
# proba_k = self._empty_leaf_correction(proba_k, k)
901+
902+
if not self.kernel_method:
903+
normalizer = proba_k.sum(axis=1)[:, np.newaxis]
904+
normalizer[normalizer == 0.0] = 1.0
905+
proba_k /= normalizer
906+
proba_k = self._empty_leaf_correction(proba_k, k)
907+
895908
all_proba.append(proba_k)
896909

897910
return all_proba

treeple/tree/honesty/_honest_prune.pyx

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,11 @@ cdef class HonestPruner(Splitter):
152152
self.samples[current_end], self.samples[p]
153153
n_missing += 1
154154
current_end -= 1
155-
elif p > pos and (self.tree._compute_feature(X_ndarray, sample_idx, &self.tree.nodes[node_idx]) <= threshold):
155+
156+
# Leverage sklearn's forked API to compute the feature value at this split node
157+
# and then compare that to the corresponding threshold
158+
# Note: this enables the function to work w/ both axis-aligned and oblique splits.
159+
elif p > pos and (self.tree._compute_feature(X_ndarray, sample_idx, &self.tree.nodes[node_idx])<= threshold):
156160
self.samples[p], self.samples[pos] = \
157161
self.samples[pos], self.samples[p]
158162
pos += 1
@@ -367,8 +371,12 @@ cdef _honest_prune(
367371
split_is_degenerate = (
368372
pruner.n_left_samples() == 0 or pruner.n_right_samples() == 0
369373
)
374+
370375
is_leaf_in_origtree = child_l[node_idx] == _TREE_LEAF
376+
371377
if invalid_split or split_is_degenerate or is_leaf_in_origtree:
378+
# invalid_split or is_leaf_in_origtree:
379+
# or split_is_degenerate or is_leaf_in_origtree:
372380
# ... and child_r[node_idx] == _TREE_LEAF:
373381
#
374382
# 1) if node is not degenerate, that means there are still honest-samples in

0 commit comments

Comments
 (0)