@@ -415,19 +415,34 @@ def _query_and_upload_goodput(self):
415415 )
416416 )
417417 except Exception as e : # pylint: disable=broad-exception-caught
418- logger .error (
418+ logger .warning (
419419 'Error while querying goodput. Skipping this cycle. Error: %s' , e
420420 )
421421 continue
422- # Upload metrics to Tensorboard.
423- self ._upload_goodput_metrics_to_tensorboard (
424- job_goodput , job_badput_breakdown , last_step
425- )
426422
427- # Upload metrics to Google Cloud Monitoring.
428- if self ._gcp_options .enable_gcp_goodput_metrics :
429- self ._upload_goodput_metrics_to_gcm (
430- self ._goodput_calculator .get_job_goodput_details ()
423+ try :
424+ # Upload metrics to Tensorboard.
425+ self ._upload_goodput_metrics_to_tensorboard (
426+ job_goodput , job_badput_breakdown , last_step
427+ )
428+ except Exception as e : # pylint: disable=broad-exception-caught
429+ logger .warning (
430+ 'Could not upload goodput metrics to Tensorboard. Skipping'
431+ ' this cycle. Error: %s' ,
432+ e ,
433+ )
434+
435+ try :
436+ # Upload metrics to Google Cloud Monitoring.
437+ if self ._gcp_options .enable_gcp_goodput_metrics :
438+ self ._upload_goodput_metrics_to_gcm (
439+ self ._goodput_calculator .get_job_goodput_details ()
440+ )
441+ except Exception as e : # pylint: disable=broad-exception-caught
442+ logger .warning (
443+ 'Could not upload goodput metrics to Google Cloud Monitoring.'
444+ ' Skipping this cycle. Error: %s' ,
445+ e ,
431446 )
432447
433448 def _final_goodput_query_and_upload (self ):
@@ -756,17 +771,24 @@ def _query_and_upload_rolling_window_goodput(self):
756771 if not self ._gcp_options .enable_gcp_goodput_metrics :
757772 continue
758773
759- now = datetime .datetime .now (datetime .timezone .utc )
760- for window_size in self ._rolling_windows :
761- window_end = now
762- window_start = now - datetime .timedelta (seconds = window_size )
763- window_start = window_start .replace (tzinfo = datetime .timezone .utc )
764- interval_metric_details = (
765- self ._goodput_calculator .get_interval_metric_details (
766- window_start , window_end
767- )
774+ try :
775+ now = datetime .datetime .now (datetime .timezone .utc )
776+ for window_size in self ._rolling_windows :
777+ window_end = now
778+ window_start = now - datetime .timedelta (seconds = window_size )
779+ window_start = window_start .replace (tzinfo = datetime .timezone .utc )
780+ interval_metric_details = (
781+ self ._goodput_calculator .get_interval_metric_details (
782+ window_start , window_end
783+ )
784+ )
785+ self ._upload_interval_goodput_metrics_to_gcm (interval_metric_details )
786+ except Exception as e : # pylint: disable=broad-exception-caught
787+ logger .warning (
788+ 'Error while querying and uploading rolling window goodput to GCM. '
789+ 'Skipping this cycle. This will not impact the workload. Error: %s' ,
790+ e ,
768791 )
769- self ._upload_interval_goodput_metrics_to_gcm (interval_metric_details )
770792
771793 def start_rolling_window_goodput_uploader (
772794 self , rolling_windows_seconds : list [int ]
0 commit comments