Skip to content

Commit 10f70bf

Browse files
dipannita08 authored and copybara-github committed
BEGIN_PUBLIC Catch and log rolling window monitoring exceptions. END_PUBLIC
This change changes non-fatal error logs to warnings. PiperOrigin-RevId: 788504355
1 parent 4268bd2 commit 10f70bf

File tree

1 file changed

+41
-19
lines changed

1 file changed

+41
-19
lines changed

ml_goodput_measurement/src/monitoring.py

Lines changed: 41 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -415,19 +415,34 @@ def _query_and_upload_goodput(self):
415415
)
416416
)
417417
except Exception as e: # pylint: disable=broad-exception-caught
418-
logger.error(
418+
logger.warning(
419419
'Error while querying goodput. Skipping this cycle. Error: %s', e
420420
)
421421
continue
422-
# Upload metrics to Tensorboard.
423-
self._upload_goodput_metrics_to_tensorboard(
424-
job_goodput, job_badput_breakdown, last_step
425-
)
426422

427-
# Upload metrics to Google Cloud Monitoring.
428-
if self._gcp_options.enable_gcp_goodput_metrics:
429-
self._upload_goodput_metrics_to_gcm(
430-
self._goodput_calculator.get_job_goodput_details()
423+
try:
424+
# Upload metrics to Tensorboard.
425+
self._upload_goodput_metrics_to_tensorboard(
426+
job_goodput, job_badput_breakdown, last_step
427+
)
428+
except Exception as e: # pylint: disable=broad-exception-caught
429+
logger.warning(
430+
'Could not upload goodput metrics to Tensorboard. Skipping'
431+
' this cycle. Error: %s',
432+
e,
433+
)
434+
435+
try:
436+
# Upload metrics to Google Cloud Monitoring.
437+
if self._gcp_options.enable_gcp_goodput_metrics:
438+
self._upload_goodput_metrics_to_gcm(
439+
self._goodput_calculator.get_job_goodput_details()
440+
)
441+
except Exception as e: # pylint: disable=broad-exception-caught
442+
logger.warning(
443+
'Could not upload goodput metrics to Google Cloud Monitoring.'
444+
' Skipping this cycle. Error: %s',
445+
e,
431446
)
432447

433448
def _final_goodput_query_and_upload(self):
@@ -756,17 +771,24 @@ def _query_and_upload_rolling_window_goodput(self):
756771
if not self._gcp_options.enable_gcp_goodput_metrics:
757772
continue
758773

759-
now = datetime.datetime.now(datetime.timezone.utc)
760-
for window_size in self._rolling_windows:
761-
window_end = now
762-
window_start = now - datetime.timedelta(seconds=window_size)
763-
window_start = window_start.replace(tzinfo=datetime.timezone.utc)
764-
interval_metric_details = (
765-
self._goodput_calculator.get_interval_metric_details(
766-
window_start, window_end
767-
)
774+
try:
775+
now = datetime.datetime.now(datetime.timezone.utc)
776+
for window_size in self._rolling_windows:
777+
window_end = now
778+
window_start = now - datetime.timedelta(seconds=window_size)
779+
window_start = window_start.replace(tzinfo=datetime.timezone.utc)
780+
interval_metric_details = (
781+
self._goodput_calculator.get_interval_metric_details(
782+
window_start, window_end
783+
)
784+
)
785+
self._upload_interval_goodput_metrics_to_gcm(interval_metric_details)
786+
except Exception as e: # pylint: disable=broad-exception-caught
787+
logger.warning(
788+
'Error while querying and uploading rolling window goodput to GCM.'
789+
'Skipping this cycle. This will not impact the workload. Error: %s',
790+
e,
768791
)
769-
self._upload_interval_goodput_metrics_to_gcm(interval_metric_details)
770792

771793
def start_rolling_window_goodput_uploader(
772794
self, rolling_windows_seconds: list[int]

0 commit comments

Comments
 (0)