@@ -59,47 +59,66 @@
  */
 void get_matrix_representation_cuda(dataSet *dataset, ensembleMetaData *metadata, ensembleData *edata, SGDOptimizerGPU** opts, const int n_opts, matrixRepresentation *matrix){
     int n_samples = dataset->n_samples;
+    int output_dim = metadata->output_dim;
     float *device_batch_obs;
     char *device_batch_cat_obs;
     char *device_data;
     float *device_V;
     bool *device_A;
     // assuming row-major order
     size_t A_size = dataset->n_samples * (metadata->n_leaves+1) * sizeof(bool);
-    size_t V_size = (metadata->n_leaves+1) * metadata->output_dim * sizeof(float);
+    size_t V_size = (metadata->n_leaves+1) * output_dim * sizeof(float);
     size_t obs_matrix_size = dataset->n_samples * metadata->n_num_features * sizeof(float);
     size_t cat_obs_matrix_size = dataset->n_samples * metadata->n_cat_features * sizeof(char) * MAX_CHAR_SIZE;
-    cudaError_t alloc_error = allocateCudaMemory((void**)&device_data, obs_matrix_size + cat_obs_matrix_size + A_size + V_size, "when trying to allocate matrix representation");
+
+    // Calculate allocation size based on what data is already on device
+    size_t extra_alloc_size = 0;
+    bool obs_on_device = (dataset->obs != nullptr && dataset->obs->data != nullptr && dataset->obs->device != cpu);
+    bool cat_on_device = (dataset->categorical_obs != nullptr && dataset->categorical_obs->data != nullptr && dataset->categorical_obs->device != cpu);
+
+    if (!obs_on_device) extra_alloc_size += obs_matrix_size;
+    if (!cat_on_device) extra_alloc_size += cat_obs_matrix_size;
+
+    cudaError_t alloc_error = allocateCudaMemory((void**)&device_data, extra_alloc_size + A_size + V_size, "when trying to allocate matrix representation");
     if (alloc_error != cudaSuccess) {
         return;
     }
-
-    // Allocate host buffer
-    char* host_data = new char[obs_matrix_size + cat_obs_matrix_size + A_size + V_size];
-    memset(host_data, 0, obs_matrix_size + cat_obs_matrix_size + A_size + V_size);
-    // Copy data into host buffer
-    if (dataset->obs != nullptr && dataset->obs->data != nullptr) {
-        std::memcpy(host_data, dataset->obs->data, obs_matrix_size);
-    }
-    if (dataset->categorical_obs != nullptr && dataset->categorical_obs->data != nullptr) {
-        std::memcpy(host_data + obs_matrix_size + V_size + A_size, dataset->categorical_obs->data, cat_obs_matrix_size);
-    }
-
-    cudaMemcpy(device_data, host_data, obs_matrix_size + cat_obs_matrix_size + A_size + V_size, cudaMemcpyHostToDevice);
-    delete[] host_data;
+    cudaMemset(device_data, 0, extra_alloc_size + A_size + V_size);

     size_t trace = 0;
-    device_batch_obs = (float*)device_data;
-    trace += obs_matrix_size;
     device_V = (float *)(device_data + trace);
     trace += V_size;
     device_A = (bool *)(device_data + trace);
     trace += A_size;
-    device_batch_cat_obs = (char *)(device_data + trace);
+
+    // Handle obs data - device-aware copy
+    if (dataset->obs != nullptr && dataset->obs->data != nullptr) {
+        if (obs_on_device) {
+            device_batch_obs = const_cast<float*>(dataset->obs->data);
+        } else {
+            device_batch_obs = (float*)(device_data + trace);
+            trace += obs_matrix_size;
+            cudaMemcpy(device_batch_obs, dataset->obs->data, obs_matrix_size, cudaMemcpyHostToDevice);
+        }
+    } else {
+        device_batch_obs = nullptr;
+    }
+
+    // Handle categorical obs data - device-aware copy
+    if (dataset->categorical_obs != nullptr && dataset->categorical_obs->data != nullptr) {
+        if (cat_on_device) {
+            device_batch_cat_obs = const_cast<char*>(dataset->categorical_obs->data);
+        } else {
+            device_batch_cat_obs = (char*)(device_data + trace);
+            cudaMemcpy(device_batch_cat_obs, dataset->categorical_obs->data, cat_obs_matrix_size, cudaMemcpyHostToDevice);
+        }
+    } else {
+        device_batch_cat_obs = nullptr;
+    }

     int n_blocks, threads_per_block;
     get_grid_dimensions(dataset->n_samples, n_blocks, threads_per_block);
-    cudaMemcpy(device_V, edata->bias, sizeof(float)*metadata->output_dim, cudaMemcpyDeviceToDevice);
+    cudaMemcpy(device_V, edata->bias, sizeof(float)*output_dim, cudaMemcpyDeviceToDevice);

     if (n_opts == 0){
         std::cerr << "No optimizers." << std::endl;
@@ -133,11 +152,14 @@ void get_matrix_representation_cuda(dataSet *dataset, ensembleMetaData *metadata |
     n_blocks = metadata->n_leaves / THREADS_PER_BLOCK + 1;
     get_V_kernel<<<n_blocks, THREADS_PER_BLOCK>>>(device_V, edata->leaf_data->values, opts, n_opts, metadata->output_dim, metadata->n_leaves);
     cudaDeviceSynchronize();
-    matrix->A = new bool[A_size];
+    // Allocate by element count, not byte size
+    int A_elems = n_samples * (metadata->n_leaves + 1);
+    int V_elems = (metadata->n_leaves + 1) * output_dim;
+    matrix->A = new bool[A_elems];
     cudaMemcpy(matrix->A, device_A, A_size, cudaMemcpyDeviceToHost);
     for (int i = 0; i < n_samples; i++)
         matrix->A[i*(metadata->n_leaves + 1)] = true;
-    matrix->V = new float[V_size];
+    matrix->V = new float[V_elems];
     cudaMemcpy(matrix->V, device_V, V_size, cudaMemcpyDeviceToHost);
     // Copy results back to CPU
     matrix->n_leaves = metadata->n_leaves;
|
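For orientation, the byte offsets that the rewritten allocation establishes inside the single device_data block can be reproduced with ordinary host-side arithmetic. The snippet below is an illustrative sketch only, with made-up dataset dimensions; it is not part of the commit, but it mirrors the V / A / observation ordering used above, where observation matrices are appended only when they still reside on the host and have to be copied in.

    #include <cstddef>
    #include <cstdio>

    int main() {
        // Hypothetical dimensions, chosen only for illustration.
        const size_t n_samples = 1024, n_leaves = 63, output_dim = 4, n_num_features = 16;
        const size_t V_size   = (n_leaves + 1) * output_dim * sizeof(float);
        const size_t A_size   = n_samples * (n_leaves + 1) * sizeof(bool);
        const size_t obs_size = n_samples * n_num_features * sizeof(float);

        // Same ordering as get_matrix_representation_cuda: V first, then A,
        // then any observation matrix that had to be copied from the host.
        std::printf("V   at offset 0, %zu bytes\n", V_size);
        std::printf("A   at offset %zu, %zu bytes\n", V_size, A_size);
        std::printf("obs at offset %zu, %zu bytes (only when host-resident)\n", V_size + A_size, obs_size);
        return 0;
    }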