Skip to content

Commit ec976e6

Browse files
committed
fixing bugs
1 parent e02c7f1 commit ec976e6

File tree

7 files changed

+85
-42
lines changed

7 files changed

+85
-42
lines changed

gbrl/common/compression.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,9 @@ def __init__(self, k: int, gradient_steps: int, n_trees: int, n_leaves_per_tree:
227227
self.method = method
228228
self.use_W = use_W
229229
if list(self.compression.parameters()):
230+
# Guard against None optimizer_kwargs
231+
if optimizer_kwargs is None:
232+
optimizer_kwargs = {}
230233
self.optimizer = optimizer_class(self.compression.parameters(), **optimizer_kwargs)
231234
self.gradient_steps = gradient_steps
232235

@@ -413,7 +416,7 @@ def compress(self, A: th.Tensor, V: th.Tensor, actions: th.Tensor,
413416
self.optimizer.zero_grad()
414417
loss.backward()
415418
self.optimizer.step()
416-
losses.append(loss)
419+
losses.append(loss.item())
417420
print(f"{i + 1}/{self.gradient_steps} - compression loss: {loss.item()}")
418421
return self.compression.get_parameters(A, V), losses
419422

gbrl/learners/actor_critic_learner.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -203,11 +203,13 @@ def compress(self, trees_to_keep: int, gradient_steps: int, features: NumericalD
203203
trees_to_keep (int): Number of trees to retain in the compressed model.
204204
gradient_steps (int): Number of optimization steps during compression.
205205
features (NumericalData): Input feature matrix (n_samples, n_features).
206-
actions (th.Tensor, optional): Target actions (for policy compression). Required if dist_type
207-
is not 'supervised_learning'.
206+
actions (th.Tensor, optional): Target actions (for policy compression). Required
207+
unless dist_type is 'deterministic' or 'supervised_learning'.
208208
log_std (th.Tensor, optional): Log standard deviation (only used for certain policy types).
209209
method (str): Tree selection method. Defaults to 'first_k'.
210-
dist_type (str): Compression type ('supervised_learning', 'actor', etc.).
210+
dist_type (str): Compression type. Supported: 'deterministic', 'supervised_learning',
211+
'categorical', 'gaussian'. For 'deterministic' and 'supervised_learning', actions
212+
are not required.
211213
optimizer_kwargs (dict, optional): Optimizer configuration.
212214
temperature (float): Temperature parameter for soft selection.
213215
lambda_reg (float): L2 regularization coefficient on weights.
@@ -216,9 +218,11 @@ def compress(self, trees_to_keep: int, gradient_steps: int, features: NumericalD
216218
Returns:
217219
float: Final loss value after compression.
218220
"""
219-
assert actions is not None, "Cannot compress a shared actor-critic policy without actions"
220-
assert dist_type != 'supervised_learning', \
221-
"Cannot compress a shared actor-critic policy using supervised learning compression methods"
221+
# Actions are only required for probabilistic policy compression
222+
if dist_type not in {'deterministic', 'supervised_learning'}:
223+
assert actions is not None, \
224+
f"Cannot compress with dist_type='{dist_type}' without actions. " \
225+
"Actions are required for probabilistic policy compression (categorical, gaussian)."
222226

223227
A, V, n_leaves_per_tree, n_leaves, n_trees = self.get_matrix_representation(features)
224228
# Convert to tensors with explicit dtypes
@@ -255,8 +259,10 @@ def compress(self, trees_to_keep: int, gradient_steps: int, features: NumericalD
255259
# Compute new tree indices for compressed model
256260
new_tree_indices = np.zeros(n_compressed_trees, dtype=np.int32)
257261
if n_compressed_trees > 1:
262+
# Convert tensor to CPU numpy before indexing with numpy array
263+
n_leaves_per_tree_np = n_leaves_per_tree.cpu().numpy()
258264
new_tree_indices[1:] = np.cumsum(
259-
n_leaves_per_tree[compressed_tree_indices].cpu().numpy()
265+
n_leaves_per_tree_np[compressed_tree_indices]
260266
)[:-1].astype(np.int32)
261267

262268
self._cpp_model.compress(n_compressed_leaves, n_compressed_trees, compressed_leaf_indices,

gbrl/learners/gbt_learner.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -658,8 +658,10 @@ def compress(self, trees_to_keep: int, gradient_steps: int, features: NumericalD
658658
# Compute new tree indices for compressed model
659659
new_tree_indices = np.zeros(n_compressed_trees, dtype=np.int32)
660660
if n_compressed_trees > 1:
661+
# Convert tensor to CPU numpy before indexing with numpy array
662+
n_leaves_per_tree_np = n_leaves_per_tree.cpu().numpy()
661663
new_tree_indices[1:] = np.cumsum(
662-
n_leaves_per_tree[compressed_tree_indices].cpu().numpy()
664+
n_leaves_per_tree_np[compressed_tree_indices]
663665
)[:-1].astype(np.int32)
664666

665667
self._cpp_model.compress(n_compressed_leaves, n_compressed_trees, compressed_leaf_indices,

gbrl/learners/multi_gbt_learner.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -922,8 +922,10 @@ def _compress(A, V, n_leaves_per_tree, compression_params, cpp_model, model_idx)
922922
# Compute new tree indices for compressed model
923923
new_tree_indices = np.zeros(n_compressed_trees, dtype=np.int32)
924924
if n_compressed_trees > 1:
925+
# Convert tensor to CPU numpy before indexing with numpy array
926+
n_leaves_per_tree_np = n_leaves_per_tree.cpu().numpy()
925927
new_tree_indices[1:] = np.cumsum(
926-
n_leaves_per_tree[compressed_tree_indices].cpu().numpy()
928+
n_leaves_per_tree_np[compressed_tree_indices]
927929
)[:-1].astype(np.int32)
928930

929931
cpp_model.compress(n_compressed_leaves, n_compressed_trees, compressed_leaf_indices,

gbrl/src/cpp/utils.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,8 +136,8 @@ void selective_copy(const int num_indices, const int* indices, T* dest, const T*
136136
void selective_copy_char(const int num_indices, const int* indices, char* dest, const char* src, const int elements_dim);
137137

138138
inline void valid_tree_idx(const int tree_idx, const ensembleMetaData* metadata){
139-
if (tree_idx < 0 || tree_idx > metadata->n_trees){
140-
std::cerr << "ERROR: invalid tree_idx " << tree_idx << " in ensemble with ntrees = " << metadata->n_trees <<std::endl;
139+
if (tree_idx < 0 || tree_idx >= metadata->n_trees){
140+
std::cerr << "ERROR: invalid tree_idx " << tree_idx << " in ensemble with ntrees = " << metadata->n_trees << " (valid range: 0 to " << metadata->n_trees - 1 << ")" <<std::endl;
141141
throw std::runtime_error("Invalid tree index");
142142
}
143143
}

gbrl/src/cuda/cuda_compression.cu

Lines changed: 44 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -59,47 +59,66 @@
5959
*/
6060
void get_matrix_representation_cuda(dataSet *dataset, ensembleMetaData *metadata, ensembleData *edata, SGDOptimizerGPU** opts, const int n_opts, matrixRepresentation *matrix){
6161
int n_samples = dataset->n_samples;
62+
int output_dim = metadata->output_dim;
6263
float *device_batch_obs;
6364
char *device_batch_cat_obs;
6465
char *device_data;
6566
float *device_V;
6667
bool *device_A;
6768
// assuming row-major order
6869
size_t A_size = dataset->n_samples * (metadata->n_leaves+1) * sizeof(bool);
69-
size_t V_size = (metadata->n_leaves+1) * metadata->output_dim * sizeof(float);
70+
size_t V_size = (metadata->n_leaves+1) * output_dim * sizeof(float);
7071
size_t obs_matrix_size = dataset->n_samples * metadata->n_num_features * sizeof(float);
7172
size_t cat_obs_matrix_size = dataset->n_samples * metadata->n_cat_features * sizeof(char) * MAX_CHAR_SIZE;
72-
cudaError_t alloc_error = allocateCudaMemory((void**)&device_data, obs_matrix_size + cat_obs_matrix_size + A_size + V_size, "when trying to allocate matrix representation");
73+
74+
// Calculate allocation size based on what data is already on device
75+
size_t extra_alloc_size = 0;
76+
bool obs_on_device = (dataset->obs != nullptr && dataset->obs->data != nullptr && dataset->obs->device != cpu);
77+
bool cat_on_device = (dataset->categorical_obs != nullptr && dataset->categorical_obs->data != nullptr && dataset->categorical_obs->device != cpu);
78+
79+
if (!obs_on_device) extra_alloc_size += obs_matrix_size;
80+
if (!cat_on_device) extra_alloc_size += cat_obs_matrix_size;
81+
82+
cudaError_t alloc_error = allocateCudaMemory((void**)&device_data, extra_alloc_size + A_size + V_size, "when trying to allocate matrix representation");
7383
if (alloc_error != cudaSuccess) {
7484
return;
7585
}
76-
77-
// Allocate host buffer
78-
char* host_data = new char[obs_matrix_size + cat_obs_matrix_size + A_size + V_size];
79-
memset(host_data, 0, obs_matrix_size + cat_obs_matrix_size + A_size + V_size);
80-
// Copy data into host buffer
81-
if (dataset->obs != nullptr && dataset->obs->data != nullptr) {
82-
std::memcpy(host_data, dataset->obs->data, obs_matrix_size);
83-
}
84-
if (dataset->categorical_obs != nullptr && dataset->categorical_obs->data != nullptr) {
85-
std::memcpy(host_data + obs_matrix_size + V_size + A_size, dataset->categorical_obs->data, cat_obs_matrix_size);
86-
}
87-
88-
cudaMemcpy(device_data, host_data, obs_matrix_size + cat_obs_matrix_size + A_size + V_size, cudaMemcpyHostToDevice);
89-
delete[] host_data;
86+
cudaMemset(device_data, 0, extra_alloc_size + A_size + V_size);
9087

9188
size_t trace = 0;
92-
device_batch_obs = (float*)device_data;
93-
trace += obs_matrix_size;
9489
device_V = (float *)(device_data + trace);
9590
trace += V_size;
9691
device_A = (bool *)(device_data + trace);
9792
trace += A_size;
98-
device_batch_cat_obs = (char *)(device_data + trace);
93+
94+
// Handle obs data - device-aware copy
95+
if (dataset->obs != nullptr && dataset->obs->data != nullptr) {
96+
if (obs_on_device) {
97+
device_batch_obs = const_cast<float*>(dataset->obs->data);
98+
} else {
99+
device_batch_obs = (float*)(device_data + trace);
100+
trace += obs_matrix_size;
101+
cudaMemcpy(device_batch_obs, dataset->obs->data, obs_matrix_size, cudaMemcpyHostToDevice);
102+
}
103+
} else {
104+
device_batch_obs = nullptr;
105+
}
106+
107+
// Handle categorical obs data - device-aware copy
108+
if (dataset->categorical_obs != nullptr && dataset->categorical_obs->data != nullptr) {
109+
if (cat_on_device) {
110+
device_batch_cat_obs = const_cast<char*>(dataset->categorical_obs->data);
111+
} else {
112+
device_batch_cat_obs = (char*)(device_data + trace);
113+
cudaMemcpy(device_batch_cat_obs, dataset->categorical_obs->data, cat_obs_matrix_size, cudaMemcpyHostToDevice);
114+
}
115+
} else {
116+
device_batch_cat_obs = nullptr;
117+
}
99118

100119
int n_blocks, threads_per_block;
101120
get_grid_dimensions(dataset->n_samples, n_blocks, threads_per_block);
102-
cudaMemcpy(device_V, edata->bias, sizeof(float)*metadata->output_dim, cudaMemcpyDeviceToDevice);
121+
cudaMemcpy(device_V, edata->bias, sizeof(float)*output_dim, cudaMemcpyDeviceToDevice);
103122

104123
if (n_opts == 0){
105124
std::cerr << "No optimizers." << std::endl;
@@ -133,11 +152,14 @@ void get_matrix_representation_cuda(dataSet *dataset, ensembleMetaData *metadata
133152
n_blocks = metadata->n_leaves / THREADS_PER_BLOCK + 1;
134153
get_V_kernel<<<n_blocks, THREADS_PER_BLOCK>>>(device_V, edata->leaf_data->values, opts, n_opts, metadata->output_dim, metadata->n_leaves);
135154
cudaDeviceSynchronize();
136-
matrix->A = new bool[A_size];
155+
// Allocate by element count, not byte size
156+
int A_elems = n_samples * (metadata->n_leaves + 1);
157+
int V_elems = (metadata->n_leaves + 1) * output_dim;
158+
matrix->A = new bool[A_elems];
137159
cudaMemcpy(matrix->A, device_A, A_size, cudaMemcpyDeviceToHost);
138160
for (int i = 0; i < n_samples; i++)
139161
matrix->A[i*(metadata->n_leaves + 1)] = true;
140-
matrix->V = new float[V_size];
162+
matrix->V = new float[V_elems];
141163
cudaMemcpy(matrix->V, device_V, V_size, cudaMemcpyDeviceToHost);
142164
// Copy results back to CPU
143165
matrix->n_leaves = metadata->n_leaves;

gbrl/src/cuda/cuda_fitter.cu

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -682,10 +682,14 @@ __global__ void split_score_l2_cuda(
682682
(direction == -1 && l_val < r_val);
683683

684684
if (violation) {
685-
// Pool the means for this output dimension
686-
float pooled = (l_val + r_val) * 0.5f;
687-
left_mean[out_idx] = pooled;
688-
right_mean[out_idx] = pooled;
685+
// Pool the means using count-weighted averaging
686+
int total_count = l_count[0] + r_count[0];
687+
if (total_count > 0) {
688+
float pooled = (l_val * l_count[0] + r_val * r_count[0]) / total_count;
689+
left_mean[out_idx] = pooled;
690+
right_mean[out_idx] = pooled;
691+
}
692+
// If total_count == 0, keep existing values (no samples to pool)
689693
}
690694
}
691695

@@ -1875,11 +1879,10 @@ void apply_monotonic_constraints_cuda(
18751879
tree_depth * sizeof(int),
18761880
cudaMemcpyDeviceToHost);
18771881

1878-
// FIX: Copy inequality directions for this tree
1882+
// FIX: Copy inequality directions for this tree from per-depth base (not per-leaf)
18791883
bool* h_inequality_directions = new bool[tree_depth];
1880-
int ineq_base = start_leaf_idx * metadata->max_depth;
18811884
cudaMemcpy(h_inequality_directions,
1882-
edata->feature_data->inequality_directions + ineq_base,
1885+
edata->feature_data->inequality_directions + tree_idx * metadata->max_depth,
18831886
tree_depth * sizeof(bool),
18841887
cudaMemcpyDeviceToHost);
18851888

@@ -1915,9 +1918,14 @@ void apply_monotonic_constraints_cuda(
19151918
int constraint_output = h_mono_output_idx[c];
19161919

19171920
for (int d = 0; d < tree_depth; ++d) {
1918-
// Convert internal feature index to global using reverse mapping
1921+
// Convert internal feature index to global using reverse mapping with bounds checks
19191922
int internal_idx = h_feature_indices[d];
1923+
if (internal_idx < 0 || internal_idx >= metadata->n_num_features) continue;
1924+
19201925
int global_idx = h_reverse_mapping[internal_idx];
1926+
// Total features = n_num_features + n_cat_features
1927+
int total_features = metadata->n_num_features + metadata->n_cat_features;
1928+
if (global_idx < 0 || global_idx >= total_features) continue;
19211929

19221930
if (global_idx == global_feature_idx) {
19231931
// If inequality_direction is inverted (false), flip the constraint

0 commit comments

Comments (0)