15
15
# See the License for the specific language governing permissions and
16
16
# limitations under the License.
17
17
18
- from typing import List , Optional , Dict
18
+ from typing import List , Optional , Dict , Any
19
19
import argparse
20
20
from datetime import timedelta
21
21
import shlex
33
33
chunked ,
34
34
ensure_execute ,
35
35
execute_with_futures ,
36
- get_insert_operations ,
37
36
log_api_request ,
38
37
map_with_futures ,
39
38
run ,
@@ -375,6 +374,40 @@ def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]):
375
374
_handle_bulk_insert_op (op , grouped_nodes [group ].nodes , resume_data )
376
375
377
376
377
def _get_failed_zonal_instance_inserts(bulk_op: Any, zone: str, lkp: util.Lookup) -> list[Any]:
    """
    Returns the failed instance-insert operations of a bulkInsert in one zone.

    The zone operations are listed with a filter on the requesting user, the
    "insert" operation type and the [startTime, endTime] window of `bulk_op`.
    The returned pages are then narrowed down to the operations that carry
    the same `operationGroupId` as `bulk_op` and contain an "error" entry.

    Args:
        bulk_op: bulkInsert operation; must provide "operationGroupId",
            "user", "startTime" and "endTime".
        zone: name of the zone whose operations are listed.
        lkp: lookup object providing the `compute` client and `project`.

    Returns:
        List of failed insert operations (possibly empty).

    Raises:
        KeyError: if `bulk_op` lacks one of the required keys.
    """
    group_id = bulk_op["operationGroupId"]
    user = bulk_op["user"]
    started = bulk_op["startTime"]
    ended = bulk_op["endTime"]

    # Values are interpolated right between the quotes: any padding inside the
    # quotes becomes part of the compared value and the filter stops matching.
    fltr = (
        f'(user eq "{user}") AND (operationType eq "insert") '
        f'AND (creationTimestamp > "{started}") AND (creationTimestamp < "{ended}")'
    )
    act = lkp.compute.zoneOperations()
    req = act.list(project=lkp.project, zone=zone, filter=fltr)
    ops = []
    # Walk through all result pages; the loop ends once `list_next` returns None.
    while req is not None:
        result = util.ensure_execute(req)
        # operationGroupId is matched here, on the client side, together with
        # the presence of an error.
        ops.extend(
            op
            for op in result.get("items", [])
            if op.get("operationGroupId") == group_id and "error" in op
        )
        req = act.list_next(req, result)
    return ops
394
+
395
+
396
def _get_failed_instance_inserts(bulk_op: Any, lkp: util.Lookup) -> list[Any]:
    """
    Returns the failed instance-insert operations of a bulkInsert operation.

    Only zones whose per-location status reports a mismatch between
    `targetVmCount` and `createdVmCount` are queried for insert operations.

    Args:
        bulk_op: bulkInsert operation; "instancesBulkInsertOperationMetadata"
            and its "perLocationStatus" are optional, missing ones are treated
            as empty.
        lkp: lookup object handed over to the per-zone query.

    Returns:
        Failed insert operations of all affected zones (possibly empty),
        ordered by zone name.
    """
    per_location = bulk_op.get("instancesBulkInsertOperationMetadata", {}).get("perLocationStatus", {})

    zones = set()  # gather zones that had failed inserts
    for loc, stat in per_location.items():
        # `partition` never raises: a key without "/" yields an empty zone and
        # is reported below instead of crashing with an unpacking error.
        pref, _, zone = loc.partition("/")
        if pref != "zones" or not zone:
            log.error(f"Unexpected location: {loc} in operation {bulk_op['name']} ")
            continue
        if stat.get("targetVmCount", 0) != stat.get("createdVmCount", 0):
            zones.add(zone)

    res = []
    # Sorted to make the order of the returned operations deterministic.
    for zone in sorted(zones):
        res.extend(_get_failed_zonal_instance_inserts(bulk_op, zone, lkp))
    return res
410
+
378
411
def _handle_bulk_insert_op (op : Dict , nodes : List [str ], resume_data : Optional [ResumeData ]) -> None :
379
412
"""
380
413
Handles **DONE** BulkInsert operations
@@ -384,10 +417,9 @@ def _handle_bulk_insert_op(op: Dict, nodes: List[str], resume_data: Optional[Res
384
417
group_id = op ["operationGroupId" ]
385
418
if "error" in op :
386
419
error = op ["error" ]["errors" ][0 ]
387
- log .warning (
420
+ log .error (
388
421
f"bulkInsert operation error: { error ['code' ]} name={ op ['name' ]} operationGroupId={ group_id } nodes={ to_hostlist (nodes )} "
389
422
)
390
- # TODO: does it make sense to query for insert-ops in case of bulkInsert-op error?
391
423
392
424
created = 0
393
425
for status in op ["instancesBulkInsertOperationMetadata" ]["perLocationStatus" ].values ():
@@ -396,18 +428,13 @@ def _handle_bulk_insert_op(op: Dict, nodes: List[str], resume_data: Optional[Res
396
428
log .info (f"created { len (nodes )} instances: nodes={ to_hostlist (nodes )} " )
397
429
return # no need to gather status of insert-operations.
398
430
399
- # TODO:
400
- # * don't perform globalOperations aggregateList request to gather insert-operations,
401
- # instead use specific locations from `instancesBulkInsertOperationMetadata`,
402
- # most of the time single zone should be sufficient.
403
- # * don't gather insert-operations per bulkInsert request, instead aggregate it across
404
- # all bulkInserts (goes one level above this function)
405
- successful_inserts , failed_inserts = separate (
406
- lambda op : "error" in op , get_insert_operations (group_id )
407
- )
408
- # Apparently multiple errors are possible... so join with +.
431
+ # TODO: don't gather insert-operations per bulkInsert request, instead aggregate it
432
+ # across all bulkInserts (goes one level above this function)
433
+ failed = _get_failed_instance_inserts (op , util .lookup ())
434
+
435
+ # Multiple errors are possible, group by all of them (joined string codes)
409
436
by_error_inserts = util .groupby_unsorted (
410
- failed_inserts ,
437
+ failed ,
411
438
lambda op : "+" .join (err ["code" ] for err in op ["error" ]["errors" ]),
412
439
)
413
440
for code , failed_ops in by_error_inserts :
@@ -428,10 +455,6 @@ def _handle_bulk_insert_op(op: Dict, nodes: List[str], resume_data: Optional[Res
428
455
f"errors from insert for node '{ failed_nodes [0 ]} ' ({ failed_ops [0 ]['name' ]} ): { msg } "
429
456
)
430
457
431
- ready_nodes = {trim_self_link (op ["targetLink" ]) for op in successful_inserts }
432
- if len (ready_nodes ) > 0 :
433
- log .info (f"created { len (ready_nodes )} instances: nodes={ to_hostlist (ready_nodes )} " )
434
-
435
458
436
459
def down_nodes_notify_jobs (nodes : List [str ], reason : str , resume_data : Optional [ResumeData ]) -> None :
437
460
"""set nodes down with reason"""
0 commit comments