@@ -354,12 +354,13 @@ class BadputType(enum.Enum):
354354 TRAINING_PREP = 2
355355 PROGRAM_STARTUP = 3
356356 DATA_LOADING_SYNC = 4
357- DATA_LOADING_ASYNC = 5 # This does not affect Goodput
357+ DATA_LOADING_ASYNC = 5
358358 UNPRODUCTIVE_CHECKPOINT_SAVE_TIME = 6
359359 UNPRODUCTIVE_CHECKPOINT_RESTORE_TIME = 7
360360 WASTED_PROGRESS_FROM_DISRUPTION = 8
361- CUSTOM_BADPUT_EVENTS = 9
362- OTHER = 10
361+ INFRASTRUCTURE_RECOVERY_FROM_DISRUPTION = 9
362+ CUSTOM_BADPUT_EVENTS = 10
363+ OTHER = 11
363364```
364365
365366#### Badput Breakdown Details
@@ -416,13 +417,20 @@ class BadputType(enum.Enum):
416417 lost after restart as well as time lost for the infrastructure to restart the
417418 workload.
418419
420+ - Infrastructure Recovery Time due to Disruption (INFRASTRUCTURE_RECOVERY_FROM_DISRUPTION)
421+
422+ This is the time taken by the infrastructure to restart the workload after a
423+ disruption. The root cause of the disruption may lie in any layer (application,
424+ infrastructure, or hardware).
425+
419426 When there is a disruption, Badput is expected to accumulate in
420427 each of the following buckets after restart:
421428
422429 - Accelerator Initialization
423430 - Training Preparation
424431 - Program Startup
425432 - Wasted Progress due to Disruption
433+ - Infrastructure Recovery Time due to Disruption
426434
427435 - Custom Badput Events (CUSTOM_BADPUT_EVENTS)
428436
@@ -442,7 +450,8 @@ print(f"Badput due to TPU initialization: {badput_breakdown[goodput.BadputType.T
442450 print(f"Badput due to training preparation: {badput_breakdown[goodput.BadputType.TRAINING_PREP]:.2f}%")
443451 print(f"Badput due to program startup: {badput_breakdown[goodput.BadputType.PROGRAM_STARTUP]:.2f}%")
444452 print(f"Badput due to data loading: {badput_breakdown[goodput.BadputType.DATA_LOADING_SYNC]:.2f}%")
445- print(f"Badput due to disruption and wasted progress: {badput_breakdown[goodput.BadputType.WASTED_PROGRESS_FROM_DISRUPTION]:.2f}%")
453+ print(f"Badput due to wasted progress from disruption: {badput_breakdown[goodput.BadputType.WASTED_PROGRESS_FROM_DISRUPTION]:.2f}%")
454+ print(f"Badput due to infrastructure recovery from disruption: {badput_breakdown[goodput.BadputType.INFRASTRUCTURE_RECOVERY_FROM_DISRUPTION]:.2f}%")
446455 print(f"Badput due to checkpoint save: {badput_breakdown[goodput.BadputType.UNPRODUCTIVE_CHECKPOINT_SAVE_TIME]:.2f}%")
447456 print(f"Badput due to checkpoint restore: {badput_breakdown[goodput.BadputType.UNPRODUCTIVE_CHECKPOINT_RESTORE_TIME]:.2f}%")
448457 print(f"Badput due to step evaluation: {badput_breakdown[goodput.BadputType.CUSTOM_BADPUT_EVENTS].get('EVAL_STEP', 0.0):.2f}%")
0 commit comments