@@ -59,47 +59,66 @@
  */
 void get_matrix_representation_cuda(dataSet *dataset, ensembleMetaData *metadata, ensembleData *edata, SGDOptimizerGPU** opts, const int n_opts, matrixRepresentation *matrix){
     int n_samples = dataset->n_samples;
+    int output_dim = metadata->output_dim;
     float *device_batch_obs;
     char *device_batch_cat_obs;
     char *device_data;
     float *device_V;
     bool *device_A;
     // assuming row-major order
     size_t A_size = dataset->n_samples * (metadata->n_leaves+1) * sizeof(bool);
-    size_t V_size = (metadata->n_leaves+1) * metadata->output_dim * sizeof(float);
+    size_t V_size = (metadata->n_leaves+1) * output_dim * sizeof(float);
     size_t obs_matrix_size = dataset->n_samples * metadata->n_num_features * sizeof(float);
     size_t cat_obs_matrix_size = dataset->n_samples * metadata->n_cat_features * sizeof(char) * MAX_CHAR_SIZE;
-    cudaError_t alloc_error = allocateCudaMemory((void**)&device_data, obs_matrix_size + cat_obs_matrix_size + A_size + V_size, "when trying to allocate matrix representation");
+
+    // Calculate allocation size based on what data is already on device
+    size_t extra_alloc_size = 0;
+    bool obs_on_device = (dataset->obs != nullptr && dataset->obs->data != nullptr && dataset->obs->device != cpu);
+    bool cat_on_device = (dataset->categorical_obs != nullptr && dataset->categorical_obs->data != nullptr && dataset->categorical_obs->device != cpu);
+
+    if (!obs_on_device) extra_alloc_size += obs_matrix_size;
+    if (!cat_on_device) extra_alloc_size += cat_obs_matrix_size;
+
+    cudaError_t alloc_error = allocateCudaMemory((void**)&device_data, extra_alloc_size + A_size + V_size, "when trying to allocate matrix representation");
     if (alloc_error != cudaSuccess) {
         return;
     }
-
-    // Allocate host buffer
-    char* host_data = new char[obs_matrix_size + cat_obs_matrix_size + A_size + V_size];
-    memset(host_data, 0, obs_matrix_size + cat_obs_matrix_size + A_size + V_size);
-    // Copy data into host buffer
-    if (dataset->obs != nullptr && dataset->obs->data != nullptr) {
-        std::memcpy(host_data, dataset->obs->data, obs_matrix_size);
-    }
-    if (dataset->categorical_obs != nullptr && dataset->categorical_obs->data != nullptr) {
-        std::memcpy(host_data + obs_matrix_size + V_size + A_size, dataset->categorical_obs->data, cat_obs_matrix_size);
-    }
-
-    cudaMemcpy(device_data, host_data, obs_matrix_size + cat_obs_matrix_size + A_size + V_size, cudaMemcpyHostToDevice);
-    delete[] host_data;
+    cudaMemset(device_data, 0, extra_alloc_size + A_size + V_size);

     size_t trace = 0;
-    device_batch_obs = (float*)device_data;
-    trace += obs_matrix_size;
     device_V = (float *)(device_data + trace);
     trace += V_size;
     device_A = (bool *)(device_data + trace);
     trace += A_size;
-    device_batch_cat_obs = (char *)(device_data + trace);
+
+    // Handle obs data - device-aware copy
+    if (dataset->obs != nullptr && dataset->obs->data != nullptr) {
+        if (obs_on_device) {
+            device_batch_obs = const_cast<float*>(dataset->obs->data);
+        } else {
+            device_batch_obs = (float*)(device_data + trace);
+            trace += obs_matrix_size;
+            cudaMemcpy(device_batch_obs, dataset->obs->data, obs_matrix_size, cudaMemcpyHostToDevice);
+        }
+    } else {
+        device_batch_obs = nullptr;
+    }
+
+    // Handle categorical obs data - device-aware copy
+    if (dataset->categorical_obs != nullptr && dataset->categorical_obs->data != nullptr) {
+        if (cat_on_device) {
+            device_batch_cat_obs = const_cast<char*>(dataset->categorical_obs->data);
+        } else {
+            device_batch_cat_obs = (char*)(device_data + trace);
+            cudaMemcpy(device_batch_cat_obs, dataset->categorical_obs->data, cat_obs_matrix_size, cudaMemcpyHostToDevice);
+        }
+    } else {
+        device_batch_cat_obs = nullptr;
+    }

     int n_blocks, threads_per_block;
     get_grid_dimensions(dataset->n_samples, n_blocks, threads_per_block);
-    cudaMemcpy(device_V, edata->bias, sizeof(float)*metadata->output_dim, cudaMemcpyDeviceToDevice);
+    cudaMemcpy(device_V, edata->bias, sizeof(float)*output_dim, cudaMemcpyDeviceToDevice);

     if (n_opts == 0){
         std::cerr << "No optimizers." << std::endl;
@@ -133,11 +152,14 @@ void get_matrix_representation_cuda(dataSet *dataset, ensembleMetaData *metadata |
     n_blocks = metadata->n_leaves / THREADS_PER_BLOCK + 1;
     get_V_kernel<<<n_blocks, THREADS_PER_BLOCK>>>(device_V, edata->leaf_data->values, opts, n_opts, metadata->output_dim, metadata->n_leaves);
     cudaDeviceSynchronize();
-    matrix->A = new bool[A_size];
+    // Allocate by element count, not byte size
+    int A_elems = n_samples * (metadata->n_leaves + 1);
+    int V_elems = (metadata->n_leaves + 1) * output_dim;
+    matrix->A = new bool[A_elems];
     cudaMemcpy(matrix->A, device_A, A_size, cudaMemcpyDeviceToHost);
     for (int i = 0; i < n_samples; i++)
         matrix->A[i*(metadata->n_leaves + 1)] = true;
-    matrix->V = new float[V_size];
+    matrix->V = new float[V_elems];
     cudaMemcpy(matrix->V, device_V, V_size, cudaMemcpyDeviceToHost);
     // Copy results back to CPU
     matrix->n_leaves = metadata->n_leaves;
|
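For orientation, the byte offsets that the rewritten allocation establishes inside the single device_data block can be reproduced with ordinary host-side arithmetic. The snippet below is an illustrative sketch only, with made-up dataset dimensions; it is not part of the commit, but it mirrors the V / A / observation ordering used above, where observation matrices are appended only when they still reside on the host and have to be copied in.

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Hypothetical dimensions, chosen only for illustration.
        const size_t n_samples = 1024, n_leaves = 63, output_dim = 4, n_num_features = 16;
        const size_t V_size   = (n_leaves + 1) * output_dim * sizeof(float);
        const size_t A_size   = n_samples * (n_leaves + 1) * sizeof(bool);
        const size_t obs_size = n_samples * n_num_features * sizeof(float);

        // Same ordering as get_matrix_representation_cuda: V first, then A,
        // then any observation matrix that had to be copied from the host.
        std::printf("V   at offset 0, %zu bytes\n", V_size);
        std::printf("A   at offset %zu, %zu bytes\n", V_size, A_size);
        std::printf("obs at offset %zu, %zu bytes (only when host-resident)\n", V_size + A_size, obs_size);
        return 0;
    }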