Skip to content

Commit 717c748

Browse files
authored
Fix notebook workflow (#20)
* Fix distributed grid search notebook example workflow
* Rename notebooks for better ordering
1 parent acb2848 commit 717c748

2 files changed

Lines changed: 86 additions & 38 deletions

File tree

Lines changed: 84 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,14 @@
4242
{
4343
"cell_type": "code",
4444
"execution_count": 1,
45-
"metadata": {},
45+
"metadata": {
46+
"execution": {
47+
"iopub.execute_input": "2026-02-04T10:39:28.573489Z",
48+
"iopub.status.busy": "2026-02-04T10:39:28.573226Z",
49+
"iopub.status.idle": "2026-02-04T10:39:28.600670Z",
50+
"shell.execute_reply": "2026-02-04T10:39:28.599579Z"
51+
}
52+
},
4653
"outputs": [
4754
{
4855
"name": "stdout",
@@ -79,7 +86,14 @@
7986
{
8087
"cell_type": "code",
8188
"execution_count": 2,
82-
"metadata": {},
89+
"metadata": {
90+
"execution": {
91+
"iopub.execute_input": "2026-02-04T10:39:28.631145Z",
92+
"iopub.status.busy": "2026-02-04T10:39:28.630985Z",
93+
"iopub.status.idle": "2026-02-04T10:39:29.477955Z",
94+
"shell.execute_reply": "2026-02-04T10:39:29.477271Z"
95+
}
96+
},
8397
"outputs": [],
8498
"source": [
8599
"import os\n",
@@ -110,7 +124,14 @@
110124
{
111125
"cell_type": "code",
112126
"execution_count": 3,
113-
"metadata": {},
127+
"metadata": {
128+
"execution": {
129+
"iopub.execute_input": "2026-02-04T10:39:29.479905Z",
130+
"iopub.status.busy": "2026-02-04T10:39:29.479702Z",
131+
"iopub.status.idle": "2026-02-04T10:39:29.483683Z",
132+
"shell.execute_reply": "2026-02-04T10:39:29.482734Z"
133+
}
134+
},
114135
"outputs": [
115136
{
116137
"name": "stdout",
@@ -150,7 +171,14 @@
150171
{
151172
"cell_type": "code",
152173
"execution_count": 4,
153-
"metadata": {},
174+
"metadata": {
175+
"execution": {
176+
"iopub.execute_input": "2026-02-04T10:39:29.485903Z",
177+
"iopub.status.busy": "2026-02-04T10:39:29.485701Z",
178+
"iopub.status.idle": "2026-02-04T10:39:37.410855Z",
179+
"shell.execute_reply": "2026-02-04T10:39:37.409270Z"
180+
}
181+
},
154182
"outputs": [
155183
{
156184
"name": "stdout",
@@ -160,28 +188,28 @@
160188
"Total processes: 4\n",
161189
"Total combinations: 17,640\n",
162190
"Combinations per process: ~4,410\n",
163-
"Process 1 will process 4410 combinations with batch size 128\n",
164191
"Process 2 will process 4410 combinations with batch size 128\n",
192+
"Process 3 will process 4410 combinations with batch size 128\n",
165193
"Process 0 will process 4410 combinations with batch size 128\n",
166-
"Processing batches on device 0/4: 0%| | 0/35 [00:00<?, ?it/s]Process 3 will process 4410 combinations with batch size 128\n",
167-
"Processing batches on device 0/4: 100%|██████████| 35/35 [00:04<00:00, 7.15it/s]\n",
194+
"Process 1 will process 4410 combinations with batch size 128\n",
195+
"Processing batches on device 0/4: 100%|██████████| 35/35 [00:04<00:00, 7.19it/s]\n",
196+
"[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n",
168197
"[Gloo] Rank 1 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n",
169198
"[Gloo] Rank 2 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n",
170-
"[Gloo] Rank 0 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n",
171199
"[Gloo] Rank 3 is connected to 3 peer ranks. Expected number of connected peer ranks is : 3\n",
172-
"Distributed grid search completed in 4.9 seconds\n",
200+
"Distributed grid search completed in 5.1 seconds\n",
173201
"Results saved to 'distributed_results/' directory\n",
174202
"Ready for result analysis in notebook!\n",
175-
"CPU times: user 94.3 ms, sys: 24 ms, total: 118 ms\n",
176-
"Wall time: 7.05 s\n"
203+
"CPU times: user 129 ms, sys: 30.7 ms, total: 160 ms\n",
204+
"Wall time: 9.49 s\n"
177205
]
178206
}
179207
],
180208
"source": [
181209
"%%time\n",
182210
"# Run the distributed grid search\n",
183211
"# Note: This cell can take up to 30-60 seconds to complete\n",
184-
"!mpirun -n 4 python 05-distributed-grid-search.py"
212+
"!mpirun --oversubscribe -n 4 python 05-distributed-grid-search.py"
185213
]
186214
},
187215
{
@@ -198,7 +226,14 @@
198226
{
199227
"cell_type": "code",
200228
"execution_count": 5,
201-
"metadata": {},
229+
"metadata": {
230+
"execution": {
231+
"iopub.execute_input": "2026-02-04T10:39:37.676552Z",
232+
"iopub.status.busy": "2026-02-04T10:39:37.676217Z",
233+
"iopub.status.idle": "2026-02-04T10:39:46.254726Z",
234+
"shell.execute_reply": "2026-02-04T10:39:46.254056Z"
235+
}
236+
},
202237
"outputs": [
203238
{
204239
"name": "stdout",
@@ -212,15 +247,8 @@
212247
"name": "stderr",
213248
"output_type": "stream",
214249
"text": [
215-
"Loading results: 0%| | 0/140 [00:00<?, ?it/s]"
216-
]
217-
},
218-
{
219-
"name": "stderr",
220-
"output_type": "stream",
221-
"text": [
222-
"Loading results: 100%|██████████| 140/140 [00:06<00:00, 21.44it/s]\n",
223-
"Converting to arrays: 100%|██████████| 13/13 [00:01<00:00, 9.15it/s]\n"
250+
"Loading results: 100%|██████████| 140/140 [00:06<00:00, 20.83it/s]\n",
251+
"Converting to arrays: 100%|██████████| 13/13 [00:01<00:00, 7.86it/s]\n"
224252
]
225253
},
226254
{
@@ -244,19 +272,17 @@
244272
"\n",
245273
"# Use DistributedGridSearch to aggregate results\n",
246274
"results = DistributedGridSearch.stack_results(\"distributed_results\")\n",
275+
"assert results is not None\n",
247276
"\n",
248-
"if results is None:\n",
249-
" print(\"No results found! Check if the distributed run completed successfully.\")\n",
250-
"else:\n",
251-
" print(f\"Successfully loaded {len(results['value']):,} evaluations\")\n",
252-
" print(f\"Result keys: {list(results.keys())}\")\n",
253-
"\n",
254-
" # Quick stats\n",
255-
" print(\"\\nQuick Statistics:\")\n",
256-
" print(f\" Best objective value: {results['value'][0]:.6f}\")\n",
257-
" print(f\" Worst objective value: {results['value'][-1]:.6f}\")\n",
258-
" print(f\" Mean objective value: {jnp.mean(results['value']):.6f}\")\n",
259-
" print(f\" Objective std: {jnp.std(results['value']):.6f}\")"
277+
"print(f\"Successfully loaded {len(results['value']):,} evaluations\")\n",
278+
"print(f\"Result keys: {list(results.keys())}\")\n",
279+
"\n",
280+
"# Quick stats\n",
281+
"print(\"\\nQuick Statistics:\")\n",
282+
"print(f\" Best objective value: {results['value'][0]:.6f}\")\n",
283+
"print(f\" Worst objective value: {results['value'][-1]:.6f}\")\n",
284+
"print(f\" Mean objective value: {jnp.mean(results['value']):.6f}\")\n",
285+
"print(f\" Objective std: {jnp.std(results['value']):.6f}\")"
260286
]
261287
},
262288
{
@@ -269,7 +295,14 @@
269295
{
270296
"cell_type": "code",
271297
"execution_count": 6,
272-
"metadata": {},
298+
"metadata": {
299+
"execution": {
300+
"iopub.execute_input": "2026-02-04T10:39:46.256805Z",
301+
"iopub.status.busy": "2026-02-04T10:39:46.256676Z",
302+
"iopub.status.idle": "2026-02-04T10:39:46.260985Z",
303+
"shell.execute_reply": "2026-02-04T10:39:46.260495Z"
304+
}
305+
},
273306
"outputs": [
274307
{
275308
"name": "stdout",
@@ -339,7 +372,14 @@
339372
{
340373
"cell_type": "code",
341374
"execution_count": 7,
342-
"metadata": {},
375+
"metadata": {
376+
"execution": {
377+
"iopub.execute_input": "2026-02-04T10:39:46.262888Z",
378+
"iopub.status.busy": "2026-02-04T10:39:46.262767Z",
379+
"iopub.status.idle": "2026-02-04T10:39:47.554122Z",
380+
"shell.execute_reply": "2026-02-04T10:39:47.553724Z"
381+
}
382+
},
343383
"outputs": [
344384
{
345385
"data": {
@@ -422,7 +462,14 @@
422462
{
423463
"cell_type": "code",
424464
"execution_count": 8,
425-
"metadata": {},
465+
"metadata": {
466+
"execution": {
467+
"iopub.execute_input": "2026-02-04T10:39:47.556792Z",
468+
"iopub.status.busy": "2026-02-04T10:39:47.556679Z",
469+
"iopub.status.idle": "2026-02-04T10:39:47.560879Z",
470+
"shell.execute_reply": "2026-02-04T10:39:47.560478Z"
471+
}
472+
},
426473
"outputs": [
427474
{
428475
"name": "stdout",
Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,8 @@
77
88
Usage:
99
mpirun -n 4 python 05-distributed-grid-search.py
10-
mpirun -n 8 python 05-distributed-grid-search.py
10+
# If running on a machine with fewer than 4 cores (e.g. CI), use --oversubscribe:
11+
mpirun --oversubscribe -n 4 python 05-distributed-grid-search.py
1112
"""
1213

1314
import os

0 commit comments

Comments
 (0)