15
15
# See the License for the specific language governing permissions and
16
16
# limitations under the License.
17
17
18
- from typing import List , Optional , Dict
18
+ from functools import lru_cache
19
+ from typing import List , Optional , Dict , Any
19
20
import argparse
20
21
from datetime import timedelta
21
22
import shlex
33
34
chunked ,
34
35
ensure_execute ,
35
36
execute_with_futures ,
36
- get_insert_operations ,
37
37
log_api_request ,
38
38
map_with_futures ,
39
39
run ,
@@ -375,6 +375,40 @@ def resume_nodes(nodes: List[str], resume_data: Optional[ResumeData]):
375
375
_handle_bulk_insert_op (op , grouped_nodes [group ].nodes , resume_data )
376
376
377
377
378
def _get_failed_zonal_instance_inserts(bulk_op: Any, zone: str, lkp: util.Lookup) -> list[Any]:
    """
    Collect the failed per-instance insert operations of one bulkInsert in a zone.

    Lists the zone operations of `zone` (project taken from `lkp.project`),
    restricted server-side to "insert" operations issued by the bulk operation's
    user and created strictly between its start and end time. The listing is then
    narrowed client-side to operations that share the bulk operation's
    `operationGroupId` and carry an "error" entry.

    Args:
        bulk_op: bulkInsert operation resource; must contain "operationGroupId",
            "user", "startTime" and "endTime".
        zone: name of the zone whose operations are listed.
        lkp: lookup object providing `compute` (presumably the Compute API
            client - confirm against util.Lookup) and `project`.

    Returns:
        Matching failed operations, in the order the API returned them.
    """
    group_id = bulk_op["operationGroupId"]
    user = bulk_op["user"]
    started = bulk_op["startTime"]
    ended = bulk_op["endTime"]

    # Server-side part of the filter; group id and error presence are checked below.
    clauses = (
        f'(user eq "{user}")',
        '(operationType eq "insert")',
        f'(creationTimestamp > "{started}")',
        f'(creationTimestamp < "{ended}")',
    )
    zone_ops = lkp.compute.zoneOperations()
    request = zone_ops.list(project=lkp.project, zone=zone, filter=" AND ".join(clauses))

    failed: list[Any] = []
    # list_next() yields None once there are no more pages.
    while request is not None:
        page = util.ensure_execute(request)
        failed.extend(
            item
            for item in page.get("items", [])
            if item.get("operationGroupId") == group_id and "error" in item
        )
        request = zone_ops.list_next(request, page)
    return failed
395
+
396
+
397
def _get_failed_instance_inserts(bulk_op: Any, lkp: util.Lookup) -> list[Any]:
    """
    Collect failed per-instance insert operations belonging to a bulkInsert.

    Only zones whose per-location status reports a mismatch between
    "targetVmCount" and "createdVmCount" are queried; each such zone is passed to
    `_get_failed_zonal_instance_inserts`.

    Args:
        bulk_op: bulkInsert operation resource. The per-location statuses are read
            from `instancesBulkInsertOperationMetadata.perLocationStatus`; a
            missing entry is treated as "no locations".
        lkp: lookup object forwarded to `_get_failed_zonal_instance_inserts`.

    Returns:
        Failed insert operations from all affected zones (zones visited in
        sorted order so the result is deterministic).
    """
    metadata = bulk_op.get("instancesBulkInsertOperationMetadata", {})

    zones = set()  # gather zones that had failed inserts
    for loc, stat in metadata.get("perLocationStatus", {}).items():
        # partition() never raises: a key without "/" yields an empty zone and is
        # reported below instead of failing on tuple unpacking.
        pref, _, zone = loc.partition("/")
        if pref != "zones":
            log.error(f"Unexpected location: {loc} in operation {bulk_op['name']}")
            continue
        if stat.get("targetVmCount", 0) != stat.get("createdVmCount", 0):
            zones.add(zone)

    res = []
    for zone in sorted(zones):
        res.extend(_get_failed_zonal_instance_inserts(bulk_op, zone, lkp))
    return res
411
+
378
412
def _handle_bulk_insert_op (op : Dict , nodes : List [str ], resume_data : Optional [ResumeData ]) -> None :
379
413
"""
380
414
Handles **DONE** BulkInsert operations
@@ -384,10 +418,9 @@ def _handle_bulk_insert_op(op: Dict, nodes: List[str], resume_data: Optional[Res
384
418
group_id = op ["operationGroupId" ]
385
419
if "error" in op :
386
420
error = op ["error" ]["errors" ][0 ]
387
- log .warning (
421
+ log .error (
388
422
f"bulkInsert operation error: { error ['code' ]} name={ op ['name' ]} operationGroupId={ group_id } nodes={ to_hostlist (nodes )} "
389
423
)
390
- # TODO: does it make sense to query for insert-ops in case of bulkInsert-op error?
391
424
392
425
created = 0
393
426
for status in op ["instancesBulkInsertOperationMetadata" ]["perLocationStatus" ].values ():
@@ -396,18 +429,13 @@ def _handle_bulk_insert_op(op: Dict, nodes: List[str], resume_data: Optional[Res
396
429
log .info (f"created { len (nodes )} instances: nodes={ to_hostlist (nodes )} " )
397
430
return # no need to gather status of insert-operations.
398
431
399
- # TODO:
400
- # * don't perform globalOperations aggregateList request to gather insert-operations,
401
- # instead use specific locations from `instancesBulkInsertOperationMetadata`,
402
- # most of the time single zone should be sufficient.
403
- # * don't gather insert-operations per bulkInsert request, instead aggregate it across
404
- # all bulkInserts (goes one level above this function)
405
- successful_inserts , failed_inserts = separate (
406
- lambda op : "error" in op , get_insert_operations (group_id )
407
- )
408
- # Apparently multiple errors are possible... so join with +.
432
+ # TODO: don't gather insert-operations per bulkInsert request, instead aggregate it
433
+ # across all bulkInserts (goes one level above this function)
434
+ failed = _get_failed_instance_inserts (op , util .lookup ())
435
+
436
+ # Multiple errors are possible, group by all of them (joined string codes)
409
437
by_error_inserts = util .groupby_unsorted (
410
- failed_inserts ,
438
+ failed ,
411
439
lambda op : "+" .join (err ["code" ] for err in op ["error" ]["errors" ]),
412
440
)
413
441
for code , failed_ops in by_error_inserts :
@@ -428,10 +456,6 @@ def _handle_bulk_insert_op(op: Dict, nodes: List[str], resume_data: Optional[Res
428
456
f"errors from insert for node '{ failed_nodes [0 ]} ' ({ failed_ops [0 ]['name' ]} ): { msg } "
429
457
)
430
458
431
- ready_nodes = {trim_self_link (op ["targetLink" ]) for op in successful_inserts }
432
- if len (ready_nodes ) > 0 :
433
- log .info (f"created { len (ready_nodes )} instances: nodes={ to_hostlist (ready_nodes )} " )
434
-
435
459
436
460
def down_nodes_notify_jobs (nodes : List [str ], reason : str , resume_data : Optional [ResumeData ]) -> None :
437
461
"""set nodes down with reason"""
0 commit comments