|
42 | 42 | { |
43 | 43 | "cell_type": "code", |
44 | 44 | "execution_count": 1, |
45 | | - "metadata": {}, |
| 45 | + "metadata": { |
| 46 | + "execution": { |
| 47 | + "iopub.execute_input": "2026-02-04T10:39:28.573489Z", |
| 48 | + "iopub.status.busy": "2026-02-04T10:39:28.573226Z", |
| 49 | + "iopub.status.idle": "2026-02-04T10:39:28.600670Z", |
| 50 | + "shell.execute_reply": "2026-02-04T10:39:28.599579Z" |
| 51 | + } |
| 52 | + }, |
46 | 53 | "outputs": [ |
47 | 54 | { |
48 | 55 | "name": "stdout", |
|
79 | 86 | { |
80 | 87 | "cell_type": "code", |
81 | 88 | "execution_count": 2, |
82 | | - "metadata": {}, |
| 89 | + "metadata": { |
| 90 | + "execution": { |
| 91 | + "iopub.execute_input": "2026-02-04T10:39:28.631145Z", |
| 92 | + "iopub.status.busy": "2026-02-04T10:39:28.630985Z", |
| 93 | + "iopub.status.idle": "2026-02-04T10:39:29.477955Z", |
| 94 | + "shell.execute_reply": "2026-02-04T10:39:29.477271Z" |
| 95 | + } |
| 96 | + }, |
83 | 97 | "outputs": [], |
84 | 98 | "source": [ |
85 | 99 | "import os\n", |
|
110 | 124 | { |
111 | 125 | "cell_type": "code", |
112 | 126 | "execution_count": 3, |
113 | | - "metadata": {}, |
| 127 | + "metadata": { |
| 128 | + "execution": { |
| 129 | + "iopub.execute_input": "2026-02-04T10:39:29.479905Z", |
| 130 | + "iopub.status.busy": "2026-02-04T10:39:29.479702Z", |
| 131 | + "iopub.status.idle": "2026-02-04T10:39:29.483683Z", |
| 132 | + "shell.execute_reply": "2026-02-04T10:39:29.482734Z" |
| 133 | + } |
| 134 | + }, |
114 | 135 | "outputs": [ |
115 | 136 | { |
116 | 137 | "name": "stdout", |
|
150 | 171 | { |
151 | 172 | "cell_type": "code", |
152 | 173 | "execution_count": 4, |
153 | | - "metadata": {}, |
| 174 | + "metadata": { |
| 175 | + "execution": { |
| 176 | + "iopub.execute_input": "2026-02-04T10:39:29.485903Z", |
| 177 | + "iopub.status.busy": "2026-02-04T10:39:29.485701Z", |
| 178 | + "iopub.status.idle": "2026-02-04T10:39:37.410855Z", |
| 179 | + "shell.execute_reply": "2026-02-04T10:39:37.409270Z" |
| 180 | + } |
| 181 | + }, |
154 | 182 | "outputs": [ |
155 | 183 | { |
156 | 184 | "name": "stdout", |
|
160 | 188 | "Total processes: 4\n", |
161 | 189 | "Total combinations: 17,640\n", |
162 | 190 | "Combinations per process: ~4,410\n", |
163 | | - "Process 1 will process 4410 combinations with batch size 128\n", |
164 | 191 | "Process 2 will process 4410 combinations with batch size 128\n", |
| 192 | + "Process 3 will process 4410 combinations with batch size 128\n", |
165 | 193 | "Process 0 will process 4410 combinations with batch size 128\n", |
166 | | - "Processing batches on device 0/4: 0%| | 0/35 [00:00<?, ?it/s]Process 3 will process 4410 combinations with batch size 128\n", |
167 | | - "Processing batches on device 0/4: 100%|██████████| 35/35 [00:04<00:00, 7.15it/s]\n", |
| 194 | + "Process 1 will process 4410 combinations with batch size 128\n", |
| 195 | + "Processing batches on device 0/4: 100%|██████████| 35/35 [00:04<00:00, 7.19it/s]\n", |
| 196 | + "[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n", |
168 | 197 | "[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n", |
169 | 198 | "[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n", |
170 | | - "[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n", |
171 | 199 | "[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n", |
172 | | - "Distributed grid search completed in 4.9 seconds\n", |
| 200 | + "Distributed grid search completed in 5.1 seconds\n", |
173 | 201 | "Results saved to 'distributed_results/' directory\n", |
174 | 202 | "Ready for result analysis in notebook!\n", |
175 | | - "CPU times: user 94.3 ms, sys: 24 ms, total: 118 ms\n", |
176 | | - "Wall time: 7.05 s\n" |
| 203 | + "CPU times: user 129 ms, sys: 30.7 ms, total: 160 ms\n", |
| 204 | + "Wall time: 9.49 s\n" |
177 | 205 | ] |
178 | 206 | } |
179 | 207 | ], |
180 | 208 | "source": [ |
181 | 209 | "%%time\n", |
182 | 210 | "# Run the distributed grid search\n", |
183 | 211 | "# Note: This cell will take 30-60 seconds to complete\n", |
184 | | - "!mpirun -n 4 python 05-distributed-grid-search.py" |
| 212 | + "!mpirun --oversubscribe -n 4 python 05-distributed-grid-search.py" |
185 | 213 | ] |
186 | 214 | }, |
187 | 215 | { |
|
198 | 226 | { |
199 | 227 | "cell_type": "code", |
200 | 228 | "execution_count": 5, |
201 | | - "metadata": {}, |
| 229 | + "metadata": { |
| 230 | + "execution": { |
| 231 | + "iopub.execute_input": "2026-02-04T10:39:37.676552Z", |
| 232 | + "iopub.status.busy": "2026-02-04T10:39:37.676217Z", |
| 233 | + "iopub.status.idle": "2026-02-04T10:39:46.254726Z", |
| 234 | + "shell.execute_reply": "2026-02-04T10:39:46.254056Z" |
| 235 | + } |
| 236 | + }, |
202 | 237 | "outputs": [ |
203 | 238 | { |
204 | 239 | "name": "stdout", |
|
212 | 247 | "name": "stderr", |
213 | 248 | "output_type": "stream", |
214 | 249 | "text": [ |
215 | | - "Loading results: 0%| | 0/140 [00:00<?, ?it/s]" |
216 | | - ] |
217 | | - }, |
218 | | - { |
219 | | - "name": "stderr", |
220 | | - "output_type": "stream", |
221 | | - "text": [ |
222 | | - "Loading results: 100%|██████████| 140/140 [00:06<00:00, 21.44it/s]\n", |
223 | | - "Converting to arrays: 100%|██████████| 13/13 [00:01<00:00, 9.15it/s]\n" |
| 250 | + "Loading results: 100%|██████████| 140/140 [00:06<00:00, 20.83it/s]\n", |
| 251 | + "Converting to arrays: 100%|██████████| 13/13 [00:01<00:00, 7.86it/s]\n" |
224 | 252 | ] |
225 | 253 | }, |
226 | 254 | { |
|
244 | 272 | "\n", |
245 | 273 | "# Use DistributedGridSearch to aggregate results\n", |
246 | 274 | "results = DistributedGridSearch.stack_results(\"distributed_results\")\n", |
| 275 | + "assert results is not None\n", |
247 | 276 | "\n", |
248 | | - "if results is None:\n", |
249 | | - " print(\"No results found! Check if the distributed run completed successfully.\")\n", |
250 | | - "else:\n", |
251 | | - " print(f\"Successfully loaded {len(results['value']):,} evaluations\")\n", |
252 | | - " print(f\"Result keys: {list(results.keys())}\")\n", |
253 | | - "\n", |
254 | | - " # Quick stats\n", |
255 | | - " print(\"\\nQuick Statistics:\")\n", |
256 | | - " print(f\" Best objective value: {results['value'][0]:.6f}\")\n", |
257 | | - " print(f\" Worst objective value: {results['value'][-1]:.6f}\")\n", |
258 | | - " print(f\" Mean objective value: {jnp.mean(results['value']):.6f}\")\n", |
259 | | - " print(f\" Objective std: {jnp.std(results['value']):.6f}\")" |
| 277 | + "print(f\"Successfully loaded {len(results['value']):,} evaluations\")\n", |
| 278 | + "print(f\"Result keys: {list(results.keys())}\")\n", |
| 279 | + "\n", |
| 280 | + "# Quick stats\n", |
| 281 | + "print(\"\\nQuick Statistics:\")\n", |
| 282 | + "print(f\" Best objective value: {results['value'][0]:.6f}\")\n", |
| 283 | + "print(f\" Worst objective value: {results['value'][-1]:.6f}\")\n", |
| 284 | + "print(f\" Mean objective value: {jnp.mean(results['value']):.6f}\")\n", |
| 285 | + "print(f\" Objective std: {jnp.std(results['value']):.6f}\")" |
260 | 286 | ] |
261 | 287 | }, |
262 | 288 | { |
|
269 | 295 | { |
270 | 296 | "cell_type": "code", |
271 | 297 | "execution_count": 6, |
272 | | - "metadata": {}, |
| 298 | + "metadata": { |
| 299 | + "execution": { |
| 300 | + "iopub.execute_input": "2026-02-04T10:39:46.256805Z", |
| 301 | + "iopub.status.busy": "2026-02-04T10:39:46.256676Z", |
| 302 | + "iopub.status.idle": "2026-02-04T10:39:46.260985Z", |
| 303 | + "shell.execute_reply": "2026-02-04T10:39:46.260495Z" |
| 304 | + } |
| 305 | + }, |
273 | 306 | "outputs": [ |
274 | 307 | { |
275 | 308 | "name": "stdout", |
|
339 | 372 | { |
340 | 373 | "cell_type": "code", |
341 | 374 | "execution_count": 7, |
342 | | - "metadata": {}, |
| 375 | + "metadata": { |
| 376 | + "execution": { |
| 377 | + "iopub.execute_input": "2026-02-04T10:39:46.262888Z", |
| 378 | + "iopub.status.busy": "2026-02-04T10:39:46.262767Z", |
| 379 | + "iopub.status.idle": "2026-02-04T10:39:47.554122Z", |
| 380 | + "shell.execute_reply": "2026-02-04T10:39:47.553724Z" |
| 381 | + } |
| 382 | + }, |
343 | 383 | "outputs": [ |
344 | 384 | { |
345 | 385 | "data": { |
|
422 | 462 | { |
423 | 463 | "cell_type": "code", |
424 | 464 | "execution_count": 8, |
425 | | - "metadata": {}, |
| 465 | + "metadata": { |
| 466 | + "execution": { |
| 467 | + "iopub.execute_input": "2026-02-04T10:39:47.556792Z", |
| 468 | + "iopub.status.busy": "2026-02-04T10:39:47.556679Z", |
| 469 | + "iopub.status.idle": "2026-02-04T10:39:47.560879Z", |
| 470 | + "shell.execute_reply": "2026-02-04T10:39:47.560478Z" |
| 471 | + } |
| 472 | + }, |
426 | 473 | "outputs": [ |
427 | 474 | { |
428 | 475 | "name": "stdout", |
|
0 commit comments