1
1
"""This is called to run a trial by worker nodes (local / remote)."""
2
2
3
3
4
+ def assign_delete_permissions (
5
+ aci_client ,
6
+ auth_client ,
7
+ max_undead_containers ,
8
+ credential ,
9
+ subscription_id ,
10
+ client_id ,
11
+ resource_group_name ,
12
+ container_groups ,
13
+ ):
14
+ from heapq import heappush , heappop
15
+ from datetime import datetime
16
+ import time
17
+ import uuid
18
+ from azure .mgmt .containerinstance import ContainerInstanceManagementClient
19
+ from azure .mgmt .authorization import AuthorizationManagementClient
20
+ from azure .mgmt .authorization .models import RoleAssignmentCreateParameters
21
+ from azure .core .exceptions import HttpResponseError
22
+
23
+ # Contributor Role
24
+ role_definition_id = f"/subscriptions/{ subscription_id } /providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
25
+
26
+ while max_undead_containers < len (container_groups ):
27
+ _ , container_group_name , started = heappop (container_groups )
28
+ try :
29
+ if started is not None :
30
+ if not started .done ():
31
+ heappush (
32
+ container_groups ,
33
+ (datetime .now (), container_group_name , started ),
34
+ )
35
+ time .sleep (1 )
36
+ continue
37
+ started = None
38
+
39
+ if aci_client is None :
40
+ aci_client = ContainerInstanceManagementClient (
41
+ credential , subscription_id
42
+ )
43
+
44
+ container_group = aci_client .container_groups .get (
45
+ resource_group_name , container_group_name
46
+ )
47
+
48
+ role_assignment_params1 = RoleAssignmentCreateParameters (
49
+ role_definition_id = role_definition_id ,
50
+ principal_id = container_group .identity .principal_id ,
51
+ principal_type = "ServicePrincipal" ,
52
+ )
53
+ role_assignment_params2 = RoleAssignmentCreateParameters (
54
+ role_definition_id = role_definition_id ,
55
+ principal_id = client_id ,
56
+ principal_type = "User" ,
57
+ )
58
+ scope = f"/subscriptions/{ subscription_id } /resourceGroups/{ resource_group_name } /providers/Microsoft.ContainerInstance/containerGroups/{ container_group_name } "
59
+
60
+ if auth_client is None :
61
+ auth_client = AuthorizationManagementClient (credential , subscription_id )
62
+
63
+ auth_client .role_assignments .create (
64
+ scope , str (uuid .uuid4 ()), role_assignment_params1
65
+ )
66
+ auth_client .role_assignments .create (
67
+ scope , str (uuid .uuid4 ()), role_assignment_params2
68
+ )
69
+ except HttpResponseError :
70
+ aci_client = None
71
+ auth_client = None
72
+ heappush (container_groups , (datetime .now (), container_group_name , started ))
73
+ time .sleep (1 )
74
+
75
+ return aci_client , auth_client
76
+
77
+
4
78
def run_azure_process (
5
79
experiment_id ,
6
80
n_runners ,
@@ -16,46 +90,33 @@ def run_azure_process(
16
90
):
17
91
startup_script = """
18
92
self_delete() {
19
- echo "Attempt to self-delete this container group. Exit code was $1."
93
+ echo "Attempt to self-delete this container group. Exit code was: $1"
94
+
95
+ if [ $1 -ne 0 ]; then
96
+ echo "Waiting 10 mintues to allow inspection of the logs..."
97
+ sleep 600
98
+ fi
99
+
100
+ SUBSCRIPTION_ID=${SUBSCRIPTION_ID}
101
+ RESOURCE_GROUP_NAME=${RESOURCE_GROUP_NAME}
102
+ CONTAINER_GROUP_NAME=${CONTAINER_GROUP_NAME}
20
103
21
104
retry_count=0
22
105
while true; do
23
106
echo "Downloading azure tools."
24
107
25
108
curl -sL https://aka.ms/InstallAzureCLIDeb -o install_script.sh
26
109
exit_code=$?
27
- if [ $exit_code -eq 0 ]; then
28
- break
110
+ if [ $exit_code -ne 0 ]; then
111
+ echo "curl failed with exit code $exit_code."
29
112
fi
30
113
31
- echo "curl failed with exit code $exit_code."
32
- if [ $retry_count -ge 300 ]; then
33
- echo "Maximum number of retries reached. Command failed."
34
- exit 62
114
+ bash install_script.sh
115
+ exit_code=$?
116
+ if [ $exit_code -ne 0 ]; then
117
+ echo "Failed to install azure tools with exit code $exit_code. Attempting to delete anyway."
35
118
fi
36
- retry_count=$((retry_count + 1))
37
- echo "Sleeping."
38
- sleep 300
39
- echo "Retrying."
40
- done
41
-
42
- bash install_script.sh
43
- exit_code=$?
44
- if [ $exit_code -ne 0 ]; then
45
- echo "Failed to install azure tools with exit code $exit_code. Attempting to delete anyway."
46
- fi
47
-
48
- SUBSCRIPTION_ID=${SUBSCRIPTION_ID}
49
- RESOURCE_GROUP_NAME=${RESOURCE_GROUP_NAME}
50
- CONTAINER_GROUP_NAME=${CONTAINER_GROUP_NAME}
51
119
52
- if [ $1 -ne 0 ]; then
53
- echo "Waiting 10 mintues to allow inspection of the logs..."
54
- sleep 600
55
- fi
56
-
57
- retry_count=0
58
- while true; do
59
120
echo "Logging into azure to delete this container."
60
121
az login --identity
61
122
exit_code=$?
@@ -78,8 +139,10 @@ def run_azure_process(
78
139
break
79
140
fi
80
141
retry_count=$((retry_count + 1))
81
- done
82
142
143
+ echo "Retrying."
144
+ done
145
+
83
146
exit $1 # failed to self-kill the container we are running this on.
84
147
}
85
148
@@ -297,12 +360,10 @@ def run_azure_process(
297
360
298
361
import time
299
362
import uuid
300
- from multiprocessing . pool import MaybeEncodingError
301
-
363
+ from heapq import heappush
364
+ from datetime import datetime
302
365
from azure .core .exceptions import HttpResponseError
303
366
from azure .identity import ClientSecretCredential
304
- from azure .mgmt .authorization import AuthorizationManagementClient
305
- from azure .mgmt .authorization .models import RoleAssignmentCreateParameters
306
367
from azure .mgmt .containerinstance import ContainerInstanceManagementClient
307
368
from azure .mgmt .containerinstance .models import (
308
369
Container ,
@@ -315,6 +376,8 @@ def run_azure_process(
315
376
)
316
377
from azure .mgmt .resource import ResourceManagementClient
317
378
379
+ max_undead_containers = 20
380
+
318
381
client_id = azure_json ["client_id" ]
319
382
320
383
if credential is None :
@@ -327,11 +390,15 @@ def run_azure_process(
327
390
resource_group_name = azure_json ["resource_group" ]
328
391
subscription_id = azure_json ["subscription_id" ]
329
392
330
- aci_client = ContainerInstanceManagementClient (credential , subscription_id )
393
+ aci_client = None
394
+ auth_client = None
395
+ container_groups = []
331
396
res_client = ResourceManagementClient (credential , subscription_id )
332
397
333
398
# If this first call fails, then allow the Exception to propagate.
334
- resource_group = res_client .resource_groups .get (resource_group_name )
399
+ resource_group_location = res_client .resource_groups .get (
400
+ resource_group_name
401
+ ).location
335
402
336
403
container_resource_requests = ResourceRequests (
337
404
cpu = num_cores ,
@@ -342,7 +409,6 @@ def run_azure_process(
342
409
)
343
410
344
411
container_group_names = set ()
345
- starts = []
346
412
for runner_id in range (n_runners ):
347
413
container_group_name = f"powerlift-container-group-{ batch_id } -{ runner_id :04} "
348
414
@@ -366,85 +432,52 @@ def run_azure_process(
366
432
environment_variables = env_vars ,
367
433
)
368
434
container_group = ContainerGroup (
369
- location = resource_group . location ,
435
+ location = resource_group_location ,
370
436
containers = [container ],
371
437
os_type = OperatingSystemTypes .linux ,
372
438
restart_policy = ContainerGroupRestartPolicy .never ,
373
439
identity = {"type" : "SystemAssigned" },
374
440
)
375
441
442
+ if aci_client is None :
443
+ aci_client = ContainerInstanceManagementClient (credential , subscription_id )
444
+
376
445
while True :
377
446
try :
378
447
# begin_create_or_update returns LROPoller,
379
448
# but this is only indicates when the containter is started
380
449
started = aci_client .container_groups .begin_create_or_update (
381
- resource_group . name , container_group_name , container_group
450
+ resource_group_name , container_group_name , container_group
382
451
)
383
452
break
384
453
except HttpResponseError :
385
454
time .sleep (1 )
386
455
387
- starts .append (started )
388
-
389
456
container_group_names .add (container_group_name )
457
+ heappush (container_groups , (datetime .now (), container_group_name , started ))
458
+ aci_client , auth_client = assign_delete_permissions (
459
+ aci_client ,
460
+ auth_client ,
461
+ max_undead_containers ,
462
+ credential ,
463
+ subscription_id ,
464
+ client_id ,
465
+ resource_group_name ,
466
+ container_groups ,
467
+ )
390
468
391
- # make sure they have all started before exiting the process
392
- for started in starts :
393
- while True :
394
- try :
395
- while not started .done ():
396
- time .sleep (1 )
397
- break
398
- except HttpResponseError :
399
- time .sleep (1 )
469
+ assign_delete_permissions (
470
+ aci_client ,
471
+ auth_client ,
472
+ 0 ,
473
+ credential ,
474
+ subscription_id ,
475
+ client_id ,
476
+ resource_group_name ,
477
+ container_groups ,
478
+ )
400
479
401
480
if delete_group_container_on_complete :
402
- auth_client = AuthorizationManagementClient (credential , subscription_id )
403
-
404
- # Contributor Role
405
- role_definition_id = f"/subscriptions/{ subscription_id } /providers/Microsoft.Authorization/roleDefinitions/b24988ac-6180-42a0-ab88-20f7382dd24c"
406
-
407
- for container_group_name in container_group_names :
408
- while True :
409
- try :
410
- container_group = aci_client .container_groups .get (
411
- resource_group_name , container_group_name
412
- )
413
- break
414
- except HttpResponseError :
415
- time .sleep (1 )
416
-
417
- scope = f"/subscriptions/{ subscription_id } /resourceGroups/{ resource_group_name } /providers/Microsoft.ContainerInstance/containerGroups/{ container_group_name } "
418
- role_assignment_params = RoleAssignmentCreateParameters (
419
- role_definition_id = role_definition_id ,
420
- principal_id = container_group .identity .principal_id ,
421
- principal_type = "ServicePrincipal" ,
422
- )
423
-
424
- while True :
425
- try :
426
- auth_client .role_assignments .create (
427
- scope , str (uuid .uuid4 ()), role_assignment_params
428
- )
429
- break
430
- except HttpResponseError :
431
- time .sleep (1 )
432
-
433
- role_assignment_params = RoleAssignmentCreateParameters (
434
- role_definition_id = role_definition_id ,
435
- principal_id = client_id ,
436
- principal_type = "User" ,
437
- )
438
-
439
- while True :
440
- try :
441
- auth_client .role_assignments .create (
442
- scope , str (uuid .uuid4 ()), role_assignment_params
443
- )
444
- break
445
- except HttpResponseError :
446
- time .sleep (1 )
447
-
448
481
deletes = []
449
482
while len (container_group_names ) != 0 :
450
483
remove_after = []
@@ -466,7 +499,7 @@ def run_azure_process(
466
499
)
467
500
deletes .append (deleted )
468
501
remove_after .append (container_group_name )
469
- except ( HttpResponseError , MaybeEncodingError ) :
502
+ except HttpResponseError :
470
503
pass
471
504
472
505
for container_group_name in remove_after :
0 commit comments